如何搭配Java使用HTML解析器jsoup? jsoup是一個很好的工具,可用來解析網站(前提是網站內容不依賴JavaScript動態產生),並可使用CSS選擇器來取得特定的HTML元素。
Java代碼
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Scrapes the boat listings published on arena-offshore.com, one category at a
 * time, and prints every listing to standard output.
 *
 * <p>Each category is paged through starting at page 1 until a page contains no
 * listing element. Fetch failures are retried a bounded number of times so that a
 * persistent network or HTTP error cannot trap the scraper in an endless loop.
 */
public class Scraper {
    /** Listing endpoint; the category number and {@code &page=N} are appended to it. */
    private static final String LIST_URL =
            "http://www.arena-offshore.com/iframe/list/index.php?category=";
    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36";
    /** Referrer header sent with every request. */
    private static final String REFERRER = "http://www.arena-offshore.com/";

    /** Selector matching one boat listing on a result page. */
    private static final String LISTING_SELECTOR = "#pu132";
    /** Selector of the listing's ID field. */
    private static final String ID_SELECTOR = "[data-muse-uid=\"U159\"]";
    /** Selector shared by the TYPE (first match) and LOCATION (second match) fields. */
    private static final String TYPE_AND_LOCATION_SELECTOR = "[data-muse-uid=\"U158\"]";
    /** Selector of the listing's brief-details field. */
    private static final String DETAILS_SELECTOR = "[data-muse-uid=\"U153\"]";

    /** Separator printed between the fields of one listing. */
    private static final String FIELD_SEPARATOR = "\n\t\t";
    /** Maximum number of consecutive attempts to fetch the same page. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    List<Category> categories = new ArrayList<>();

    /** A listing category: the number used in the query string plus a display title. */
    static class Category {
        private final int categoryNumber;
        private final String title;

        /**
         * @param categoryNumber value sent as the {@code category} query parameter
         * @param title          heading printed before the category's listings
         */
        public Category(int categoryNumber, String title) {
            this.categoryNumber = categoryNumber;
            this.title = title;
        }

        /** @return the value sent as the {@code category} query parameter */
        public int getCategoryNumber() {
            return categoryNumber;
        }

        /** @return the heading printed before the category's listings */
        public String getTitle() {
            return title;
        }
    }

    /** Creates a scraper pre-populated with the six categories to scrape. */
    public Scraper() {
        categories.add(new Category(1, "CREW BOATS"));
        categories.add(new Category(2, "TUG BOATS"));
        categories.add(new Category(3, "AHT & AHTS"));
        categories.add(new Category(4, "SUPPLY/UTILITY VESSELS"));
        categories.add(new Category(5, "BARGES"));
        categories.add(new Category(6, "MISCELLANEOUS"));
    }

    /**
     * Prints the category title followed by every listing found on its result pages.
     *
     * <p>Stops when a page has no listing element, or when the same page could not
     * be fetched {@link #MAX_FETCH_ATTEMPTS} times in a row.
     *
     * @param category the category whose pages are fetched and printed
     */
    private void scrapeCategory(Category category) {
        System.out.println("\n" + category.getTitle() + "\n");
        String searchUrl = LIST_URL + category.getCategoryNumber() + "&page=";
        int pageIndex = 1;
        int failedAttempts = 0;
        while (true) {
            String pageUrl = searchUrl + pageIndex;
            Document doc;
            try {
                doc = Jsoup.connect(pageUrl).userAgent(USER_AGENT).referrer(REFERRER).get();
            } catch (IOException e) {
                // The page index is intentionally not advanced: the same page is retried.
                // The attempt counter bounds the retries so a permanent failure
                // (e.g. an HTTP error status or an unreachable host) ends the category
                // instead of looping forever.
                failedAttempts++;
                System.err.println("Failed to fetch " + pageUrl + " (attempt " + failedAttempts
                        + " of " + MAX_FETCH_ATTEMPTS + "): " + e);
                if (failedAttempts >= MAX_FETCH_ATTEMPTS) {
                    System.err.println("Giving up on category " + category.getTitle());
                    return;
                }
                continue;
            }
            failedAttempts = 0;

            List<Element> listings = doc.select(LISTING_SELECTOR);
            if (listings.isEmpty()) { // no more results
                return;
            }
            for (Element listing : listings) {
                System.out.println("\t" + describe(listing));
            }
            pageIndex++;
        }
    }

    /**
     * Builds the multi-line description of one listing: ID, link, type, location and
     * brief details, in that order. A field whose element is missing from the markup
     * is rendered as an empty string rather than raising an exception.
     *
     * @param listing the listing element to describe
     * @return the fields joined by {@link #FIELD_SEPARATOR}
     */
    private static String describe(Element listing) {
        List<Element> typeAndLocation = listing.select(TYPE_AND_LOCATION_SELECTOR);
        Element link = listing.select("a").first();
        return textOf(listing.select(ID_SELECTOR).first())                            // ID
                + FIELD_SEPARATOR + (link == null ? "" : link.attr("href"))           // HREF
                + FIELD_SEPARATOR + textAt(typeAndLocation, 0)                        // TYPE
                + FIELD_SEPARATOR + textAt(typeAndLocation, 1)                        // LOCATION
                + FIELD_SEPARATOR + textOf(listing.select(DETAILS_SELECTOR).first()); // BRIEF DETAILS
    }

    /** Returns the text of {@code element}, or an empty string when it is {@code null}. */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }

    /** Returns the text of the element at {@code index}, or an empty string when out of range. */
    private static String textAt(List<Element> elements, int index) {
        return index < elements.size() ? textOf(elements.get(index)) : "";
    }

    /** Scrapes and prints every configured category, in insertion order. */
    public void scrapeAllCategories() {
        for (Category category : categories) {
            scrapeCategory(category);
        }
    }

    /**
     * Entry point: scrapes all categories and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        new Scraper().scrapeAllCategories();
    }
}
注:你需要下載jsoup核心函式庫(core library),並將它加入你的建置路徑(build path)。
輸出
CREW BOATS
AR-C1002
http://www.arena-offshore.com/agent-boat-for-sale-AR-C1002.html
AGENT BOAT
EAST MED.
FOR SALE 14 X 4 X 1.9 (DEPTH)M, 2007 BUILT 630 BHP, 12 PERSONEL, IN EAST MED.
...
AR-C1000
http://www.arena-offshore.com/AR-KED.html?page=13
CREW BOAT
SOUTH AMERICA
FOR SALE 17 X 5 X 2.18M, 2009 BUILT 1200 BHP, IN SOUTH AMERICA
TUG BOATS
AR-KTK
http://www.arena-offshore.com/single-screw-tug-boat-AR-KTK.html
SINGLE SCREW
TURKEY
FOR SALE 1998 BUILT/ 2008 REBUILT 1000 HP/16 TBP
...
AHT & AHTS
AR-RZA
http://www.arena-offshore.com/AR-RZA.html
ANCHOR HANDLING TUG/TOWING
AFRICA
FOR SALE 36 X 10 X 4 M (MAX DRAFT) 4400 BHP/58 TBP
...
SUPPLY/UTILITY VESSELS
AR-U5001
http://www.arena-offshore.com/survey-vessel-in-south-east-asia-AR-U5001.html
SURVEY SUPPOT VESSEL
SOUTH EAST ASIA
FOR SALE 20 X 6 X 1.5 (DRAFT)M, 2012 BUILT, IRS CLASS 650 BHP, 50 M2 DECK SPACE
...
BARGES
AR-KLM
http://www.arena-offshore.com/AR-KLM.html
ACCOMMODATION
SHETLANDS
FOR CHARTER 1993 BUILT MAJOR CONVERSION 2004 AND 2013
...
MISCELLANEOUS
AR-SAA
http://www.arena-offshore.com/AR-SAA.html
SHALLOW DRAFT MPP/WORKBOAT
-
FOR CHARTER 2800 HP/37.3 TBP CERTIFIED
...
注:以上輸出已經過縮短;每個「...」的位置實際上還會印出更多結果。
您是否已經會任何程式語言?或者您打算用什麼來執行程式碼? –