None of the above worked for me, so I took a stab at putting something together myself. It's the first Java I've written in four years, so I'm sure it can be improved. Note that, as written, it applies every Disallow rule it finds, regardless of which User-agent the rule was declared for.
// Assumes a constant such as: public static final String DISALLOW = "Disallow:";
// plus imports for java.net.URL, java.net.MalformedURLException,
// java.io.InputStream, java.io.IOException and java.util.ArrayList.
public static boolean robotSafe(URL url)
{
    String strHost = url.getHost();
    String strRobot = "http://" + strHost + "/robots.txt";
    URL urlRobot;
    try {
        urlRobot = new URL(strRobot);
    } catch (MalformedURLException e) {
        // something weird is happening, so don't trust it
        return false;
    }

    String strCommands;
    try
    {
        // read the whole robots.txt file into a single string
        InputStream urlRobotStream = urlRobot.openStream();
        StringBuilder sb = new StringBuilder();
        byte[] b = new byte[1000];
        int numRead;
        while ((numRead = urlRobotStream.read(b)) != -1)
        {
            sb.append(new String(b, 0, numRead));
        }
        urlRobotStream.close();
        strCommands = sb.toString();
    }
    catch (IOException e)
    {
        return true; // if there is no robots.txt file, it is OK to crawl
    }
    if (strCommands.contains(DISALLOW)) // if there are no "Disallow" lines, nothing is blocked
    {
        String[] split = strCommands.split("\n");
        ArrayList<RobotRule> robotRules = new ArrayList<>();
        String mostRecentUserAgent = null;
        for (int i = 0; i < split.length; i++)
        {
            String line = split[i].trim();
            if (line.toLowerCase().startsWith("user-agent"))
            {
                int start = line.indexOf(":") + 1;
                int end = line.length();
                mostRecentUserAgent = line.substring(start, end).trim();
            }
            else if (line.startsWith(DISALLOW))
            {
                if (mostRecentUserAgent != null)
                {
                    RobotRule r = new RobotRule();
                    r.userAgent = mostRecentUserAgent;
                    int start = line.indexOf(":") + 1;
                    int end = line.length();
                    r.rule = line.substring(start, end).trim();
                    robotRules.add(r);
                }
            }
        }
        for (RobotRule robotRule : robotRules)
        {
            String path = url.getPath();
            if (robotRule.rule.length() == 0) return true;  // an empty Disallow allows everything
            if (robotRule.rule.equals("/")) return false;   // a Disallow of "/" blocks the whole site
            if (robotRule.rule.length() <= path.length())
            {
                String pathCompare = path.substring(0, robotRule.rule.length());
                if (pathCompare.equals(robotRule.rule)) return false;
            }
        }
    }
    return true;
}
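For reference, the method only understands plain "User-agent" and "Disallow" lines; "Allow", "Crawl-delay" and wildcard patterns are simply ignored. A made-up robots.txt it can handle looks like this:

User-agent: *
Disallow: /private/
Disallow: /tmp/

With those rules, robotSafe returns false for http://example.com/private/page.html (the path starts with /private/) and true for http://example.com/index.html.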
And you will need this helper class:
/**
 *
 * @author Namhost.com
 */
public class RobotRule
{
    public String userAgent;
    public String rule;

    RobotRule() {
    }

    @Override public String toString()
    {
        StringBuilder result = new StringBuilder();
        String NEW_LINE = System.getProperty("line.separator");
        result.append(this.getClass().getName() + " Object {" + NEW_LINE);
        result.append("   userAgent: " + this.userAgent + NEW_LINE);
        result.append("   rule: " + this.rule + NEW_LINE);
        result.append("}");
        return result.toString();
    }
}
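For completeness, here is a minimal sketch of how the check could sit in front of a Jsoup fetch. The Crawler class and the seed URL are made up for illustration; it assumes Jsoup is on the classpath and that robotSafe and the DISALLOW constant live in (or are imported into) the same class:

import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class Crawler
{
    public static void main(String[] args) throws Exception
    {
        URL url = new URL("http://example.com/some/page.html"); // placeholder URL
        if (robotSafe(url))
        {
            // only fetch and parse pages that robots.txt does not disallow
            Document doc = Jsoup.connect(url.toString()).get();
            System.out.println(doc.title());
        }
        else
        {
            System.out.println("Blocked by robots.txt: " + url);
        }
    }

    // robotSafe(URL) and the DISALLOW constant from above would go here
}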
What exactly is the problem? Parsing robots.txt seems to be outside Jsoup's scope. Jsoup parses web pages, as you said yourself. – Darwind
Thanks, yeah, I'm using Jsoup to parse the pages... but the requirement is to parse only the URLs from urls.txt that are allowed (not restricted), and Jsoup doesn't seem to be the right tool for that validation, or can't do it at all. So what I need to know is how to perform this check against robots.txt before doing the actual parsing. –
OK, that's fine. I was looking for a small project that uses Jsoup, so I might do this myself. – alkis