2012-09-24 81 views
0

我有一個HTML文件,我想用Jsoup讀取並將結果導出到Excel工作表。在這個過程中,我想提取HTML文件中存在的所有圖像的鏈接(src)。Java:如何使用Jsoup庫爲Java提取HTML文件中的圖像鏈接

下面是我用來實現這個功能的代碼片段:

// Fragment from inside a method whose header is not shown here.
// Reads report.html, selects every element that has a src attribute, and
// records/prints the src of the <img> elements among them.
File myhtml = new File("D:\\Projects\\Java\\report.html"); 
      //get the string from the file myhtml 
      // (getFileString is a helper defined elsewhere; presumably it returns the file's text -- confirm)
      String str = getFileString(myhtml); 

      //getting the links to the images as in the html file 
      Document doc = Jsoup.parseBodyFragment(str); 
      // "[src]" matches any element carrying a src attribute, not only <img>
      Elements media = doc.select("[src]"); 

      //System.out.println(media.size()); 
      for(Element imageLink:media) 
      { 

       // NOTE: this if has no braces, so it guards only the assignment below;
       // the println that follows runs on every iteration, img or not.
       if(imageLink.tagName().equals("img")) 
        //storing the local link to image as global variable in imlink 
        // P1.imlink is a single field, so each matching image overwrites the previous value
        P1.imlink = imageLink.attr("src").toString(); 
System.out.println(P1.imlink); 
      } 

     } 

我的HTML文件中有兩個圖像,我想取得它們兩個的鏈接。但是,我編寫的代碼僅顯示了文件中第一個圖像的鏈接。請幫我找出我的代碼中的錯誤!

回答

0

試試這個:

File reportFile = new File("D:\\Projects\\Java\\report.html");

// Parse straight from the file; set the proper Charset (2nd param) and BaseUri (3rd param) here
Document parsedReport = Jsoup.parse(reportFile, null, "");

// "img[src]" keeps only <img> elements that actually carry a src attribute
for (Element image : parsedReport.select("img[src]")) {
    // Do something with your links here ...
    String link = image.attr("src");
    System.out.println(link);
}

順便說一句,也許你的問題出在把鏈接存儲到全局變量的那一部分:每次循環執行時它都會被覆蓋。更好的解決方案是將鏈接存儲到列表中,或者在第一次命中後跳出循環。

0
//Dom ex............ 

    import org.w3c.tidy.*; 

    import java.io.*; 

    import java.net.*; 

    import org.w3c.dom.*; 

    import java.util.*; 

    /**
     * Downloads every image referenced by an {@code <img src="...">} tag on a
     * web page into the current working directory.
     *
     * <p>The page is parsed with JTidy into a W3C DOM. Each image is saved as
     * {@code demo<N>.jpg}, where {@code N} starts at 1 and follows the order of
     * the {@code <img>} tags in the document. A failure on one image is reported
     * on standard error and does not stop the remaining downloads.
     */
    public class demo
    {
        /** Address of the page whose images are downloaded. */
        private static final String PAGE_URL = "http://www.southreels.com";

        /**
         * Entry point: lists the image sources found on {@link #PAGE_URL} and
         * downloads each of them.
         *
         * @param arg command-line arguments (unused)
         */
        public static void main(String arg[])
        {
            try
            {
                URL page = new URL(PAGE_URL);
                List<String> srcs = collectImageSources(page);

                String dir = System.getProperty("user.dir") + System.getProperty("file.separator");

                int i = 0;
                for (String src : srcs) {
                    System.out.println(i + " " + src);
                    // Incremented before building the name, so files start at demo1.jpg.
                    i++;
                    try {
                        // Resolve against the page so relative src values work too;
                        // an absolute src is used unchanged.
                        URL server = new URL(page, src);
                        download(server, dir + "demo" + i + ".jpg");
                    } catch (IOException e) {
                        // One broken link must not abort the remaining downloads.
                        System.err.println("Skipping " + src + ": " + e);
                    }
                }
            }
            catch (Exception e)
            {
                // Report instead of silently swallowing the failure.
                System.err.println("Could not read images from " + PAGE_URL + ": " + e);
            }
        }

        /**
         * Parses the page and returns the {@code src} value of every
         * {@code <img>} element that has one, in document order.
         *
         * @param page address of the HTML page to parse
         * @return the {@code src} attribute values; empty if the page has no images
         * @throws IOException if the page cannot be opened or closed
         */
        private static List<String> collectImageSources(URL page) throws IOException
        {
            List<String> srcs = new ArrayList<String>();
            InputStream input = page.openStream();
            try {
                Document document = new Tidy().parseDOM(input, null);
                NodeList imgs = document.getElementsByTagName("img");
                for (int i = 0; i < imgs.getLength(); i++) {
                    // Fully qualified: a bare "Node" may clash with the wildcard
                    // import of org.w3c.tidy.
                    org.w3c.dom.Node srcAttr = imgs.item(i).getAttributes().getNamedItem("src");
                    // An <img> without a src attribute has nothing to download.
                    if (srcAttr != null) {
                        srcs.add(srcAttr.getNodeValue());
                    }
                }
            } finally {
                input.close();
            }
            return srcs;
        }

        /**
         * Copies the content behind {@code source} into the file {@code target},
         * closing both streams even when the copy fails.
         *
         * @param source address of the resource to fetch
         * @param target path of the file to write
         * @throws IOException if reading the resource or writing the file fails
         */
        private static void download(URL source, String target) throws IOException
        {
            InputStream is = source.openStream();
            try {
                OutputStream os = new FileOutputStream(target);
                try {
                    byte[] buffer = new byte[1024];
                    int bytesRead = is.read(buffer);
                    while (bytesRead != -1) {
                        os.write(buffer, 0, bytesRead);
                        bytesRead = is.read(buffer);
                    }
                } finally {
                    os.close();
                }
            } finally {
                is.close();
            }
        }
    }