Html Agility Pack 在這裏可以從兩方面幫助你:
1)它讓獲取XML處理指令(PI)變得更容易:因爲它把PI數據當作HTML來解析,所以會將其內容轉化爲屬性。
2)HtmlDocument實現了IXPathNavigable,因此它可以通過.NET Xslt轉換引擎直接轉換。
下面是一段可以運行的代碼。我不得不添加一個特定的XmlResolver來正確處理Xslt轉換,但我認爲這只是skechers這個網站的特殊情況。
/// <summary>
/// Convenience overload: runs the download-and-transform pipeline and writes the
/// result to a UTF-8 encoded file.
/// </summary>
/// <param name="url">Address of the XML document to download.</param>
/// <param name="userAgent">User-Agent header value sent with the requests.</param>
/// <param name="outputFilePath">Path of the file that receives the transformed output.</param>
public static void DownloadAndProcessXml(string url, string userAgent, string outputFilePath)
{
    XmlTextWriter fileWriter = new XmlTextWriter(outputFilePath, Encoding.UTF8);

    // The writer owns the output file; dispose it once the transformation is done.
    using (fileWriter)
    {
        DownloadAndProcessXml(url, userAgent, fileWriter);
    }
}
/// <summary>
/// Downloads the XML document at <paramref name="url"/>, finds the stylesheet referenced by
/// its xml-stylesheet processing instruction, downloads that XSLT and writes the result of
/// the transformation to <paramref name="output"/>.
/// </summary>
/// <param name="url">Address of the XML document to download.</param>
/// <param name="userAgent">User-Agent header value sent with every request.</param>
/// <param name="output">Writer that receives the transformed document.</param>
/// <exception cref="InvalidOperationException">
/// The downloaded document has no xml-stylesheet processing instruction with an href.
/// </exception>
public static void DownloadAndProcessXml(string url, string userAgent, XmlWriter output)
{
    UserAgentXmlUrlResolver resolver = new UserAgentXmlUrlResolver(url, userAgent);

    using (WebClient client = new WebClient())
    {
        // Download the XML document. The User-Agent header is set explicitly because
        // the target site does not answer requests without one.
        client.Headers[HttpRequestHeader.UserAgent] = resolver.UserAgent;
        HtmlDocument xmlDoc = new HtmlDocument();
        using (var xmlStream = client.OpenRead(url))
        {
            xmlDoc.Load(xmlStream);
        }

        // Html Agility Pack does not model XML processing instructions; the PI is
        // reached through an XPath name() test and its data is read as attributes.
        var stylesheetNode = xmlDoc.DocumentNode.SelectSingleNode("//*[name()='?xml-stylesheet']");
        string xsltUrl = stylesheetNode == null ? null : stylesheetNode.GetAttributeValue("href", null);
        if (string.IsNullOrEmpty(xsltUrl))
        {
            throw new InvalidOperationException(
                "The document at '" + url + "' does not declare an xml-stylesheet with an href.");
        }

        // Resolve the href against the document URL so that relative, root-relative and
        // absolute stylesheet references all produce a valid address.
        Uri xsltUri = new Uri(new Uri(url), xsltUrl);

        // Download and compile the XSLT document.
        client.Headers[HttpRequestHeader.UserAgent] = resolver.UserAgent;
        XslCompiledTransform xslt = new XslCompiledTransform();
        using (var xsltStream = client.OpenRead(xsltUri))
        using (XmlTextReader xsltReader = new XmlTextReader(xsltStream))
        {
            xslt.Load(xsltReader, new XsltSettings(true, false), null);
        }

        // HtmlDocument implements IXPathNavigable, so it can be fed to the transform
        // directly. The custom resolver handles the requests the stylesheet makes
        // while it runs.
        xslt.Transform(xmlDoc, null, output, resolver);
    }
}
/// <summary>
/// URL resolver that sends a configurable User-Agent header when fetching entities and
/// maps the relative URI "/" to a fixed root URL.
/// </summary>
/// <remarks>
/// According to the original author the transformation fails without this resolver,
/// probably because the specific XSLT file involved refers back to the root document.
/// </remarks>
public class UserAgentXmlUrlResolver : XmlUrlResolver
{
    /// <summary>Creates a resolver for the given root document URL and User-Agent.</summary>
    public UserAgentXmlUrlResolver(string rootUrl, string userAgent)
    {
        UserAgent = userAgent;
        RootUrl = rootUrl;
    }

    /// <summary>User-Agent header value sent by <see cref="GetEntity"/>; skipped when null or empty.</summary>
    public string UserAgent { get; set; }

    /// <summary>URL returned by <see cref="ResolveUri"/> when the relative URI is "/".</summary>
    public string RootUrl { get; set; }

    /// <summary>
    /// Maps "/" to <see cref="RootUrl"/> when one is configured; every other request is
    /// delegated to the base resolver.
    /// </summary>
    public override Uri ResolveUri(Uri baseUri, string relativeUri)
    {
        if (relativeUri != "/" || string.IsNullOrEmpty(RootUrl))
        {
            return base.ResolveUri(baseUri, relativeUri);
        }

        return new Uri(RootUrl);
    }

    /// <summary>
    /// Opens a read stream on <paramref name="absoluteUri"/> through a new WebClient,
    /// adding the User-Agent header when one is configured.
    /// </summary>
    public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
    {
        WebClient webClient = new WebClient();
        bool hasUserAgent = !string.IsNullOrEmpty(UserAgent);
        if (hasUserAgent)
        {
            webClient.Headers[HttpRequestHeader.UserAgent] = UserAgent;
        }

        return webClient.OpenRead(absoluteUri);
    }
}
你可以這樣調用它:
string url = "http://www.skechers.com/";
string ua = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
DownloadAndProcessXml(url, ua, "skechers.html");
如果您的XML格式良好,爲什麼還要用HtmlAgilityPack呢? – Cameron 2011-03-21 23:50:16
我正在嘗試獲取頁面摘要,即頁面標題和元描述,以及頁面上的img src列表。我允許輸入網絡上任何有效的網址。因此,回答您的問題:我得到的並不總是格式良好的XML,而且即使格式良好,文檔標題和說明的格式也不一致。 – 2011-03-21 23:57:42