使用正則表達式中提取的URL,我在下面的網址csharp-online 拍攝靈感來自於例如節目,並打算在此頁面檢索的所有網址alexa在.NET
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
namespace ExtractingUrls
{
class Program
{
static void Main(string[] args)
{
WebClient client = new WebClient();
const string url = "http://www.alexa.com/topsites/category/Top/Society/History/By_Topic/Science/Engineering_and_Technology";
string source = client.DownloadString(url);
//Console.WriteLine(Getvals(source));
string matchPattern =
@"<a.rel=""nofollow"".style=""font-size:0.8em;"".href=[""'](?<url>[^""^']+[.]*)[""'].class=""offsite"".*>(?<name>[^<]+[.]*)</a>";
foreach (Hashtable grouping in ExtractGroupings(source, matchPattern, true))
{
foreach (DictionaryEntry DE in grouping)
{
Console.WriteLine("Value = " + DE.Value);
Console.WriteLine("");
}
}
// End.
Console.ReadLine();
}
public static ArrayList ExtractGroupings(string source, string matchPattern, bool wantInitialMatch)
{
ArrayList keyedMatches = new ArrayList();
int startingElement = 1;
if (wantInitialMatch)
{
startingElement = 0;
}
Regex RE = new Regex(matchPattern, RegexOptions.Multiline);
MatchCollection theMatches = RE.Matches(source);
foreach (Match m in theMatches)
{
Hashtable groupings = new Hashtable();
for (int counter = startingElement; counter < m.Groups.Count; counter++)
{
// If we had just returned the MatchCollection directly, the
// GroupNameFromNumber method would not be available to use
groupings.Add(RE.GroupNameFromNumber(counter),
m.Groups[counter]);
}
keyedMatches.Add(groupings);
}
return (keyedMatches);
}
}
}
但在這裏我面臨一個問題,當我執行每個URL時會顯示三次,這是首先顯示整個定位標記,然後該URL顯示兩次。任何人都可以建議我應該在哪裏糾正,以便我可以讓每個URL只顯示一次。
** DO _not_解析HTML使用正則表達式** http://stackoverflow.com/questions/1732348/regex-match-open-tags-除了-xhtml-self-contained-tags – SLaks 2010-01-31 23:49:35
@SLacks:「有時候適合解析一個有限的,已知的HTML集合」 – 2010-02-06 01:12:01