2014-04-11 24 views
0

indexOf 總是返回負數(例如 -7),不論我怎麼改。我打算用 http://www.columbusstate.edu 這個網站來測試,從任意網頁抽取所有 URL,但 indexOf 一直出問題。[功課]

import java.io.IOException; 
import java.io.PrintWriter; 
import java.net.URL; 
import java.util.ArrayList; 
import java.util.Arrays; 
import java.util.List; 
import java.util.Scanner; 
public class WebCrawler 
{ 
    private static int linkCount = 0; 
    public static void main(String[] args) throws IOException 
    { 

實例變量

 ArrayList<String> links = new ArrayList<String>(); 
     System.out.println("Enter the website you would like to web crawl"); 
     Scanner input = new Scanner(System.in); 
     String address=input.next(); 

去網站

 URL locator = new URL(address); 
     Scanner in=new Scanner(locator.openStream()); 

     String str=""; 
     PrintWriter out=new PrintWriter("links.txt"); 

搜索網頁和拉麟ks,或者它應該反正。

 while(in.hasNextLine()) 
     { 
      str=in.next(); 
      if(str.contains("href=\"http://")) 
      { 
       linkCount++; 
       int start = str.indexOf("ht"); 
       int end = str.indexOf("/\""); 
       if(links.contains(str.substring(start, end))){ 

       } 
       else{ 
        links.add("Line Number "+linkCount+""+str.substring(start, end)); 
       } 
      } 
      else if(str.contains("href=\"https://")){ 
       linkCount++; 
       int start = str.indexOf("ht"); 
       int end = str.indexOf("://")+15; 
       if(links.contains(str.substring(start, end))){ 

       } 
       else{ 
        links.add("Line Number "+linkCount+""+str.substring(start, end)); 
       } 
      } 
     } 
     int num = links.size(); 
     System.out.println(num); 
     out.println("Number of links on this webpage is "+linkCount); 
     out.println("Links are:"); 
     for(int i = links.size()-1; i>0; i--){ 
      out.println(links.get(i)); 
     } 
     out.close(); 
    } 
} 
+0

當你通過它一步一個調試器或放在調試printlns會發生什麼的例子嗎? –

回答

0

如果你確實需要從網頁中提取鏈接,那麼與其手工解析,不如使用一個成熟的 HTML 解析器。下面是一個使用 JSOUP 的例子:

import java.io.IOException; 
import java.util.List; 
import java.util.ArrayList; 

import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.select.Elements; 

public class HTMLUtils { 

    /** Utility class — not instantiable. */
    private HTMLUtils() {} 

    /**
     * Fetches the page at {@code url} and returns the absolute href of every
     * anchor element on it.
     *
     * @param url the page to fetch
     * @return the list of absolute link targets, in document order
     * @throws IOException if the page cannot be retrieved
     */
    public static List<String>extractLinks(String url) throws IOException { 
        final ArrayList<String> hrefs = new ArrayList<String>(); 

        Document page = Jsoup.connect(url).get(); 

        // Every <a> element that carries an href attribute.
        Elements anchors = page.select("a[href]"); 
        for (Element anchor : anchors) { 
            // "abs:href" resolves relative links against the page's base URL.
            hrefs.add(anchor.attr("abs:href")); 
        } 

        return hrefs; 
    } 

    /** Demo: print every link found on the sample site. */
    public final static void main(String[] args) throws Exception{ 
        String site = "http://www.columbusstate.edu"; 
        for (String link : HTMLUtils.extractLinks(site)) { 
            System.out.println(link); 
        } 
    } 
} 
+0

如果我爲了真正的世界目的而這麼做,我們正在學習如何在cpsc 2中使用自定義例外,這是我的教授提出的任務。儘管謝謝你的建議! – user3525508

+0

@ user3525508沒問題 – RealHowTo