-1
我正在嘗試編寫一個程序,用於在文本文件中查找任何類型的URL,例如hxxp://www.testsite.com/images/logo.png。下面的代碼是我在擴充一份在線教程時寫的(教程出處見代碼頂部的引用),但它一直無法捕獲全部的URL,包括那些嵌入在HTML標籤中的URL。如果有任何我可以嘗試的幫助或建議,我將不勝感激。謝謝。尋找URL的正則表達式
/* Reference: http://www.vogella.com/tutorials/JavaRegularExpressions/article.html
*/
package de.vogella.regex.weblinks;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts absolute URLs from the HTML anchor elements ({@code <a ... href="...">...</a>})
 * found in a text file.
 *
 * <p>Only anchor elements are inspected, and at most one URL (the first one that appears
 * inside the element, normally the {@code href} value) is reported per anchor.
 */
public class LinkGetter {

    /** URL schemes the extractor recognises; shared by the patterns below. */
    private static final String SCHEMES = "(?:https?|ftp|gopher|telnet|file)";

    /**
     * A whole anchor element. DOTALL lets the anchor text span the line breaks
     * that {@link #getLinks(String)} keeps between the lines of the file.
     */
    private static final Pattern HTML_TAG =
            Pattern.compile("<a\\b[^>]*href=\"[^>]*>(.*?)</a>", Pattern.DOTALL);

    /**
     * A URL with one of the recognised schemes. The '-' is placed last in the character
     * class so it is a literal hyphen; written as "+-=" it formed a range that also
     * accepted '<' and made matches run into the following markup.
     */
    private static final Pattern LINK = Pattern.compile(
            SCHEMES + ":(?://|\\\\)+[\\w:#@%/;$()~?+,=\\\\.&-]*");

    /** A link that already carries one of the recognised schemes. */
    private static final Pattern ABSOLUTE = Pattern.compile(SCHEMES + ":.*", Pattern.DOTALL);

    /** Links that do not point to a retrievable resource. */
    private static final Pattern NON_NAVIGABLE = Pattern.compile("javascript:.*|mailto:.*");

    /** Creates an extractor; all patterns are precompiled constants. */
    public LinkGetter() {
    }

    /**
     * Prints the links found in the file named by the first argument,
     * or in {@code TestFile.rtf} when no argument is given.
     *
     * @param args optional: path of the file to scan
     */
    public static void main(String[] args) {
        String filepath = args.length > 0 ? args[0] : "TestFile.rtf";
        System.out.println(new LinkGetter().getLinks(filepath));
    }

    /**
     * Reads the given file and returns the URLs found in its anchor elements.
     *
     * <p>If the file cannot be read, the problem is reported on standard error and an
     * empty list is returned.
     *
     * @param filepath path of the file to scan
     * @return the extracted URLs in order of appearance; never {@code null}
     */
    public List<String> getLinks(String filepath) {
        StringBuilder text = new StringBuilder();
        // try-with-resources closes the reader exactly once, whether or not anything matches.
        try (BufferedReader reader = new BufferedReader(new FileReader(filepath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep a separator so a URL ending one line cannot fuse with the next line.
                text.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("Cannot read " + filepath + ": " + e);
            return new ArrayList<String>();
        }
        return collectLinks(filepath, text.toString());
    }

    /**
     * Returns the URLs found in the anchor elements of the given text.
     *
     * @param text the text to scan
     * @return the extracted URLs in order of appearance; never {@code null}
     */
    public List<String> extractLinks(String text) {
        return collectLinks("<text>", text);
    }

    /** Scans {@code text} for anchors; {@code source} is only used in error messages. */
    private List<String> collectLinks(String source, String text) {
        List<String> links = new ArrayList<String>();
        Matcher tagMatcher = HTML_TAG.matcher(text);
        while (tagMatcher.find()) {
            Matcher urlMatcher = LINK.matcher(tagMatcher.group());
            // An anchor may hold no absolute URL (e.g. href="page.html"); skip it
            // instead of calling group() on a failed match.
            if (!urlMatcher.find()) {
                continue;
            }
            String url = urlMatcher.group();
            if (valid(url)) {
                links.add(makeAbsolute(source, url));
            }
        }
        return links;
    }

    /** Returns {@code false} for {@code javascript:} and {@code mailto:} links. */
    private boolean valid(String candidate) {
        return !NON_NAVIGABLE.matcher(candidate).matches();
    }

    /**
     * Normalises a link: links with a recognised scheme and root-relative links are
     * returned unchanged, any other non-empty link gets a leading {@code '/'}.
     *
     * @throws RuntimeException if the link is empty
     */
    private String makeAbsolute(String filepath, String link) {
        if (ABSOLUTE.matcher(link).matches() || link.startsWith("/")) {
            return link;
        }
        if (!link.isEmpty()) {
            return "/" + link;
        }
        throw new RuntimeException("Cannot make absolute. File: " + filepath
                + " Link " + link);
    }
}
你真的需要給我們一些不起作用的測試用例。查看代碼以查找不起作用的輸入是[so]的主題。相關問題 - [什麼是最好的正則表達式來檢查一個字符串是否是一個有效的URL?](http://stackoverflow.com/questions/161738/what-is-the-best-regular-expression-to-check-if-a-string-is-a-valid-url) – Dukeling