我對此很新,所以請原諒任何無知。來自httpwebresponse的部分頁面源
我創建了我的第一個多線程應用程序,它的目的是發出大量 web 請求(webrequest),解析每個頁面源,並將結果存儲在表中以供進一步查詢。理論上可能有多達30-40000個請求,因此需要多線程。每個請求獲得一個線程。我認爲一切正常,除了我經常只能得到非常不完整的頁面源,就好像 StreamReader 在讀取響應時被中斷一樣。我在瀏覽器中使用相同的請求,卻能獲取整個頁面。我認爲這可能與線程有關,儘管我認爲我仍然是以同步方式發出這些調用。(理想情況下,我想異步執行這些調用,但我不知道該怎麼做。)有沒有方法可以知道頁面源是否完整,以確定是否需要再次請求?我相信我忽略了這裏的某些複雜性。如能提供任何代碼方面的幫助,將不勝感激。
對不起格式化。以下是發出請求的類的代碼的一部分:
using System;
using System.Collections.Generic;
using System.Text;
using System.Data.Sql;
using System.Data.SqlClient;
using System.Threading;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace M4EverCrawler
{
public class DomainRun
{
/// <summary>
/// Launches the three pipeline stages (Run1, Run2, Run3), each on its own
/// dedicated thread, in that order.
/// </summary>
public void Start()
{
    ThreadStart[] stages = { this.Run1, this.Run2, this.Run3 };
    foreach (ThreadStart stage in stages)
    {
        Thread worker = new Thread(stage);
        worker.Start();
    }
}
/// <summary>
/// Creates a run bound to the supplied domain-name queue manager and proxy
/// queue manager.
/// </summary>
/// <param name="dnq">Queue manager the fetch stage pulls domains from.</param>
/// <param name="prxQ">Queue manager the fetch stage obtains proxies from.</param>
public DomainRun(DNQueueManager dnq, ProxyQueueManager prxQ)
{
    this.ProxyManager = prxQ;
    this.dnqManager = dnq;
}
// Source of domains to fetch; Run1 loads it and dequeues from it.
private DNQueueManager dnqManager;
// Supplies the proxy address used by CapturePage; loaded by Run1.
private ProxyQueueManager ProxyManager;
// Hand-off between Run1 (adds fetched pages) and Run2 (takes them for parsing).
public StagingQueue StagingQueue = new StagingQueue();
// Hand-off between Run2 (adds parsed domains and their links) and Run3 (takes them for metrics).
public MetricsQueueManager MQmanager = new MetricsQueueManager();
// Receives domains from Run3 once the metrics calls have run.
public CommitQueueManager CQmanager = new CommitQueueManager();
/// <summary>
/// Stage 1 (fetch): loads the domain and proxy queues, then loops forever
/// taking the next available domain, downloading its page source via
/// <c>CapturePage</c>, and handing the result to <c>StagingQueue</c>.
/// </summary>
/// <remarks>
/// This method never returns; it is intended to run on its own thread.
/// </remarks>
protected void Run1()
{
    dnqManager.LoadDNs();
    ProxyManager.LoadProxies();

    // One generator for the lifetime of this worker instead of allocating a
    // fresh Random on every iteration. It is local, so only this thread uses it.
    Random jitter = new Random();

    while (true)
    {
        if (dnqManager.IsDNDavailable)
        {
            DomainData dnd = dnqManager.GetDND();
            dnd.PageSource = CapturePage(dnd.DomainName);
            StagingQueue.AddDN2Q(dnd);
        }

        // Short random pause (0-19 ms) between polls of the queue.
        Thread.Sleep(jitter.Next(20));
    }
}
/// <summary>
/// Stage 2 (parse): loops forever taking fetched domains from
/// <c>StagingQueue</c>, parsing them with <c>ParsePage</c>, and queueing both
/// the parsed domain and one new <c>DomainData</c> per discovered link on
/// <c>MQmanager</c>.
/// </summary>
/// <remarks>
/// This method never returns; it is intended to run on its own thread.
/// </remarks>
protected void Run2()
{
    // One generator for the lifetime of this worker instead of allocating a
    // fresh Random on every iteration. It is local, so only this thread uses it.
    Random jitter = new Random();

    while (true)
    {
        if (StagingQueue.IsDNDavailable)
        {
            DomainData dnd = StagingQueue.GetDND();

            // Parsing options are (re)applied before every ParsePage call.
            MaxOutboundLinks = 3;
            AvoidHttps = true;
            InsideLinks = false;
            VerifyBackLinks = true;

            MQmanager.AddDN2Q(ParsePage(dnd));

            foreach (string link in dnd.Hlinks)
            {
                DomainData dndLink = new DomainData(dnd.MainSeqno, link.ToString());
                dndLink.ParentDomainName = dnd.DomainName;
                // The parent's page source is cleared once links are being
                // processed; kept inside the loop so that a domain with no
                // links retains its source exactly as before.
                dnd.PageSource = String.Empty;
                MQmanager.AddDN2Q(dndLink);
            }
        }

        // Short random pause (0-19 ms) between polls of the queue.
        Thread.Sleep(jitter.Next(20));
    }
}
/// <summary>
/// Stage 3 (metrics): loops forever taking domains from <c>MQmanager</c>,
/// running the Alexa, Compete and Quantcast steps on each, and then passing
/// the domain to <c>CQmanager</c>.
/// </summary>
/// <remarks>
/// This method never returns; it is intended to run on its own thread.
/// </remarks>
protected void Run3()
{
    // One generator for the lifetime of this worker instead of allocating a
    // fresh Random on every iteration. It is local, so only this thread uses it.
    Random jitter = new Random();

    while (true)
    {
        if (MQmanager.IsDNDavailable)
        {
            DomainData dnd = MQmanager.GetDND();
            RunAlexa(dnd);
            RunCompete(dnd);
            RunQuantcast(dnd);
            CQmanager.AddDN2Q(dnd, MQmanager, 1000);
        }

        // Short random pause (0-19 ms) between polls of the queue.
        Thread.Sleep(jitter.Next(20));
    }
}
// Downloads the page at URIstring through a proxy and returns the response
// body as a string. Returns String.Empty when URIstring cannot be turned
// into a Uri. The handling of request failures is elided in this excerpt.
private string CapturePage(string URIstring)
{
Uri myUri;
try
{
myUri = new Uri(URIstring);
}
// Any failure constructing the Uri is treated as "no page"; the exception
// object itself (URIex) is not used.
catch (Exception URIex)
{
return String.Empty;
}
// Reuse the current proxy when one is set, otherwise ask the manager for one.
string proxyIP = ProxyManager.GetCurrentProxy() == "" ? ProxyManager.GetProxy() : ProxyManager.GetCurrentProxy();
// NOTE(review): proxCtr is not read anywhere in the visible code; it may be
// used by the elided catch body below - confirm before removing.
int proxCtr = 0;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(myUri);
WebProxy Proxy = new WebProxy(proxyIP);
request.Proxy = Proxy;
// 20 000 ms. NOTE(review): per the HttpWebRequest documentation, Timeout
// governs GetResponse; reads from the response stream are governed by
// ReadWriteTimeout, which is left at its default here - confirm this is intended.
request.Timeout = 20000;
try
{
// Both the response and the reader are disposed when the using blocks exit.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
// NOTE(review): the body is always decoded as ASCII, regardless of the
// charset the server reports; non-ASCII bytes will not survive decoding.
// Verify whether the response's declared encoding should be used instead.
using (StreamReader strmRdr = new StreamReader(response.GetResponseStream(), Encoding.ASCII))
{
return strmRdr.ReadToEnd();
}
}
}
// Failure handling is elided in the original post.
catch (InvalidOperationException Wex)
{
. . .
}
}
非常感謝,feroze。我運用了你的建議,我的問題解決了。現在我正在處理異步Web調用。你或其他人的任何建議都會很棒。順便說一句,我喜歡這個網站。通過閱讀問答和評論,我學到了很多東西。爲所有貢獻的人歡呼! – JLP188