2012-12-31 59 views
0

任何人都可以使用這段代碼嗎? 我想下載這個html中的所有URL http://mises.org/books/(它們都是pdf的)試圖在html中下載所有URL

我理解了基本邏輯,我想我只是搞亂了正則表達式。這是我到目前爲止:

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Net; 
using System.IO; 
using System.Text.RegularExpressions; 

namespace DownloadPdfs 
{ 
    class Program 
    { 
     static void Main(string[] args) 
     { 
      StringBuilder sb = new StringBuilder(); 

      byte[] buf = new byte[8192]; 

      HttpWebRequest request = (HttpWebRequest) 
      WebRequest.Create("http://mises.org/books/"); 

      HttpWebResponse response = (HttpWebResponse) 
       request.GetResponse(); 

      Stream resStream = response.GetResponseStream(); 

      string tempString = null; 
      int count = 0; 

      do 
      { 
       count = resStream.Read(buf, 0, buf.Length); 

       if (count != 0) 
       { 
        tempString = Encoding.ASCII.GetString(buf, 0, count); 
        sb.Append(tempString); 
       } 
      } 
      while (count > 0); // any more data to read? 

      string html = sb.ToString(); 
      List<string> listoflinks = new List<string>(); 

      string input = html; 
      Regex rx = new Regex(@"(?<="")[^""]+(?="")|[^\s""]\S*"); 
      for (Match match = rx.Match(input); match.Success; match = match.NextMatch()) 
      { 
       listoflinks.Add(match.ToString()); 
      } 

      foreach (var v in listoflinks) 
      { 
       using (WebClient Client = new WebClient()) 
       { 
        Client.DownloadFile(v,v); 
       } 
      }   
     } 
    } 
} 
+0

我可以建議您使用HTML Agility Pack而不是http://htmlagilitypack.codeplex.com/嗎?這將使得在原始HTML文檔中找到鏈接變得更加容易。這裏的第一個例子展示瞭如何檢索所有鏈接:http://htmlagilitypack.codeplex.com/wikipage?標題=示例 – dash

+0

如果您使用的是html敏捷包,請查看以前的這個問題:http:// stackoverflow .com/questions/2248411/get-all-links-on-html- –

回答

1

請嘗試下面的代碼。該模式將匹配錨點的HREF屬性的值。

Regex rx = new Regex(@"href=""(?<Url>[^.""]+\.pdf)""",RegexOptions.IgnoreCase | RegexOptions.Multiline); 
for (Match match = rx.Match(input); match.Success; match = match.NextMatch()) 
{ 
    var link = match.Groups["Url"].Value; 
    listoflinks.Add(link); 
} 
+0

champ --------- –

0

使用庫來解析html,如HtmlAgilityPack。

public List<string> GetLinks(string html) 
{ 
    var htmlDoc = new HtmlDocument(); 
    htmlDoc.LoadHtml(html); 

    var linkNodes = htmlDoc.DocumentNode.SelectNodes("//a[@href]"); 

    if (linkNodes == null) 
    { 
     return new List<string>(); 
    } 

    var linkNodesWithLink = linkNodes.Where(x => x.Attributes.Contains("href")).ToList(); 

    var links = linkNodesWithLink.Select(x => x.Attributes["href"].Value) 
     .Where(x => !string.IsNullOrWhiteSpace(x)) 
     .Select(x => x.Trim()) 
     .ToList(); 
    links = links.Distinct().ToList(); 
    return links; 
}