2015-04-20 79 views
-1

我需要針對某個網站檢查一份包含許多代理的列表,因此決定使用 libcurl 來完成。
我參考了這個(this)範例,並依照自己的需求做了修改。
這裏是我的代碼(cURL multi 接口搭配代理建立多個連接):

#include <cstdio> 
#include <cstring> 
#include <fstream> 
#include <string> 
#include <iostream> 

#include <curl/curl.h> 

/* somewhat unix-specific */ 
#include <sys/time.h> 
#include <unistd.h> 

using namespace std; 

/* Fixed-size table of easy-handle pointers.
   NOTE(review): nothing in the code shown here reads or writes this array;
   confirm it is unused before removing it. */
CURL * handles [100]; 

/*
 * Build an easy handle that requests the target URL through one proxy.
 *
 * proxyData  proxy address handed to CURLOPT_PROXY.
 *
 * Returns the configured easy handle, or NULL when curl_easy_init() fails.
 * The caller owns the handle and must release it with curl_easy_cleanup().
 *
 * NOTE: the custom header list must stay alive for as long as the handle is
 * in use, and this function has no way to hand it back to the caller, so it
 * is never freed. Freeing it needs the caller's cooperation.
 */
CURL * createProxyHandle (string proxyData){
    CURL * handle = curl_easy_init();
    if (handle == NULL) {
        return NULL;
    }

    curl_slist * chunk = NULL;
    chunk = curl_slist_append(chunk, "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1");
    chunk = curl_slist_append(chunk, "Accept-Language: ru-RU,ru;q=0.9,en;q=0.8");
    chunk = curl_slist_append(chunk, "Accept-Encoding: gzip, deflate, sdch");

    curl_easy_setopt (handle, CURLOPT_URL, "<site>");

    /* curl_easy_setopt() is variadic: numeric and boolean options must be
       passed as long, otherwise the callee reads the wrong argument size. */
    curl_easy_setopt (handle, CURLOPT_CONNECTTIMEOUT, 40L);
    curl_easy_setopt (handle, CURLOPT_TIMEOUT, 50L);
    curl_easy_setopt (handle, CURLOPT_FRESH_CONNECT, 1L);
    curl_easy_setopt (handle, CURLOPT_VERBOSE, 1L);
    curl_easy_setopt (handle, CURLOPT_FOLLOWLOCATION, 1L);

    curl_easy_setopt (handle, CURLOPT_ENCODING , "gzip");
    /* NOTE(review): relies on libcurl copying string options, since
       proxyData is destroyed when this function returns. */
    curl_easy_setopt (handle, CURLOPT_PROXY, proxyData.c_str());
    curl_easy_setopt (handle, CURLOPT_HTTPHEADER, chunk);
    curl_easy_setopt (handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36");

    return handle;
}

int main(){ 
    ifstream fin ("data.txt", ifstream::in); 
    string proxy; 

    CURLM *multi_handle; 
    CURLMsg *msg; 

    int msgs_left; 
    int still_running; 



    multi_handle = curl_multi_init(); 

    while (fin >> proxy){ 
     cout << "Proxy: " << proxy << endl; 
     CURL * handle = createProxyHandle (proxy); 
     curl_multi_add_handle(multi_handle, handle); 
    } 

    curl_multi_perform(multi_handle, &still_running); 

do { 
    struct timeval timeout; 
    int rc; /* select() return code */ 
    CURLMcode mc; /* curl_multi_fdset() return code */ 

    fd_set fdread; 
    fd_set fdwrite; 
    fd_set fdexcep; 
    int maxfd = -1; 

    long curl_timeo = -1; 

    FD_ZERO(&fdread); 
    FD_ZERO(&fdwrite); 
    FD_ZERO(&fdexcep); 

    /* set a suitable timeout to play around with */ 
    timeout.tv_sec = 1; 
    timeout.tv_usec = 0; 

    curl_multi_timeout(multi_handle, &curl_timeo); 
    if(curl_timeo >= 0) { 
     timeout.tv_sec = curl_timeo/1000; 
     if(timeout.tv_sec > 1) 
     timeout.tv_sec = 1; 
     else 
     timeout.tv_usec = (curl_timeo % 1000) * 1000; 
    } 

    /* get file descriptors from the transfers */ 
    mc = curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd); 

    if(mc != CURLM_OK) 
    { 
     fprintf(stderr, "curl_multi_fdset() failed, code %d.\n", mc); 
     break; 
    } 

    /* On success the value of maxfd is guaranteed to be >= -1. We call 
     select(maxfd + 1, ...); specially in case of (maxfd == -1) there are 
     no fds ready yet so we call select(0, ...) --or Sleep() on Windows-- 
     to sleep 100ms, which is the minimum suggested value in the 
     curl_multi_fdset() doc. */ 

    if(maxfd == -1) { 
#ifdef _WIN32 
     Sleep(100); 
     rc = 0; 
#else 
     /* Portable sleep for platforms other than Windows. */ 
     struct timeval wait = { 0, 100 * 1000 }; /* 100ms */ 
     rc = select(0, NULL, NULL, NULL, &wait); 
#endif 
    } 
    else { 
     /* Note that on some platforms 'timeout' may be modified by select(). 
     If you need access to the original value save a copy beforehand. */ 
     rc = select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout); 
    } 

    switch(rc) { 
    case -1: 
     /* select error */ 
     break; 
    case 0: 
    default: 
     /* timeout or readable/writable sockets */ 
     curl_multi_perform(multi_handle, &still_running); 
     break; 
    } 
    } while(still_running); 

    while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { 
     if (msg->msg == CURLMSG_DONE) { 
      printf("Finished with %d\n", msg->data.result); 
     } 
     } 
    cout << "Completed" << endl; 
    curl_multi_cleanup(multi_handle); 

    return 0; 
} 

這些代理並不可靠,但我在輸出中看到的是:

Proxy: 69.12.64.105:8089 
Proxy: 69.12.64.105:7808 
Proxy: 210.245.20.170:80 
Proxy: 190.74.165.109:8080 
Proxy: 39.184.2.111:8123 
Proxy: 190.201.166.37:8080 
Proxy: 190.36.85.199:8080 
Proxy: 92.255.231.54:8080 
Proxy: 124.126.126.101:80 
Proxy: 43.250.255.65:8080 
Proxy: 69.12.64.106:7808 
Proxy: 201.217.213.166:8080 
Proxy: 178.169.90.188:8888 
Proxy: 124.248.205.25:8080 
Proxy: 39.190.82.133:8123 
Proxy: 190.77.230.36:8080 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 69.12.64.105... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 69.12.64.105... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 210.245.20.170... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 190.74.165.109... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 39.184.2.111... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 190.201.166.37... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 190.36.85.199... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 92.255.231.54... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 124.126.126.101... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 43.250.255.65... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 69.12.64.106... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 201.217.213.166... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 178.169.90.188... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 124.248.205.25... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 39.190.82.133... 
* Rebuilt URL to: <site> 
* Hostname was NOT found in DNS cache 
* Trying 190.77.230.36... 
* Connected to 69.12.64.105 (69.12.64.105) port 8089 (#0) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

< HTTP/1.1 503 Service Unavailable 
< Server: squid/3.2.13 
< Mime-Version: 1.0 
< Date: Mon, 20 Apr 2015 23:00:24 GMT 
< Content-Type: text/html 
< Content-Length: 3694 
< X-Squid-Error: ERR_DNS_FAIL 0 
< 
* Received HTTP code 503 from proxy after CONNECT 
* Connected to 69.12.64.105 (69.12.64.105) port 7808 (#1) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

< HTTP/1.1 503 Service Unavailable 
< Server: squid/3.2.13 
< Mime-Version: 1.0 
< Date: Mon, 20 Apr 2015 23:00:25 GMT 
< Content-Type: text/html 
< Content-Length: 3694 
< X-Squid-Error: ERR_DNS_FAIL 0 
< 
* Received HTTP code 503 from proxy after CONNECT 
* Connected to 43.250.255.65 (43.250.255.65) port 8080 (#9) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

< HTTP/1.1 200 OK 
< 
* Proxy replied OK to CONNECT request 
* found 173 certificates in /etc/ssl/certs/ca-certificates.crt 
* Connected to 69.12.64.106 (69.12.64.106) port 7808 (#10) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

< HTTP/1.1 200 Connection established 
< 
* Proxy replied OK to CONNECT request 
* found 173 certificates in /etc/ssl/certs/ca-certificates.crt 
* Connected to 190.77.230.36 (190.77.230.36) port 8080 (#15) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

< HTTP/1.0 200 Connection established 
< Proxy-agent: tinyproxy/1.8.2 
< 
* Proxy replied OK to CONNECT request 
* found 173 certificates in /etc/ssl/certs/ca-certificates.crt 
* Connected to 39.184.2.111 (39.184.2.111) port 8123 (#4) 
* Establish HTTP proxy tunnel to <site>:443 
> CONNECT <site>:443 HTTP/1.1 
Host: <site>:443 
User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 
Proxy-Connection: Keep-Alive 
Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 
Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 
Accept-Encoding: gzip, deflate, sdch 

* Proxy CONNECT aborted due to timeout 
* Connection time-out 
* Closing connection 5 
* Connection time-out 
* Closing connection 6 
* Connection time-out 
* Closing connection 7 
* Connection time-out 
* Closing connection 8 
* SSL connection timeout 
* Closing connection 9 
* SSL connection timeout 
* Closing connection 10 
* Connection time-out 
* Closing connection 11 
* Connection time-out 
* Closing connection 12 
* Connection time-out 
* Closing connection 13 
* Connection time-out 
* Closing connection 14 
* SSL connection timeout 
* Closing connection 15 
* Connection timed out after 50056 milliseconds 
* Connection timed out after 50055 milliseconds 
Finished with 56 
Finished with 56 
Finished with 56 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Finished with 28 
Completed 

在某些情況下(這份列表中確實有很多代理是壞的,但並非全部),curl 收到了代理的回應並發送了請求標頭,之後就沒有任何進展。我單獨測試過這些代理,它們都沒問題。
我無法弄清楚 curl 的 multi 接口到底發生了什麼。

回答

0

curl 的這份文檔(documentation)提到 multi 接口有一些限制。我意識到自己用到了這些受限制的功能:

  • 使用 NSS 的 SSL 連接
  • HTTP 代理的 CONNECT 操作

爲了解決這個問題,我改用 curl 的 easy 接口搭配 POSIX 線程,效果很好。以下是我的解決方案:基本上就是這個範例(example),再加上一些讓 TLS 在多線程下安全使用的代碼:

#define USE_GNUTLS 

#include <cstdio> 
#include <pthread.h> 

#include <curl/curl.h> 

/* we have this global to let the callback get easy access to it.
   NOTE(review): with USE_GNUTLS defined, no code shown here references
   this pointer; it looks like a leftover from a locking-callback variant
   for another TLS backend. Confirm before removing. */ 
static pthread_mutex_t *lockarray; 

#ifdef USE_GNUTLS 
#include <gcrypt.h> 
#include <errno.h> 

GCRY_THREAD_OPTION_PTHREAD_IMPL; 

void init_locks(void) 
{ 
    gcry_control(GCRYCTL_SET_THREAD_CBS); 
} 

#define kill_locks() 
#endif 

static void *pull_one_url(void *url) 
{ 
    FILE * file = fopen ("/dev/null", "w"); 

    CURL * handle = curl_easy_init(); 

    curl_slist * chunk = NULL; 
    chunk = curl_slist_append(chunk, "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1"); 
    chunk = curl_slist_append(chunk, "Accept-Language: ru-RU,ru;q=0.9,en;q=0.8"); 
    chunk = curl_slist_append(chunk, "Accept-Encoding: gzip, deflate, sdch"); 

    curl_easy_setopt (handle, CURLOPT_URL, "https://www.avito.ru"); 
    curl_easy_setopt (handle, CURLOPT_CONNECTTIMEOUT, 30); 
    curl_easy_setopt (handle, CURLOPT_TIMEOUT, 30); 
    curl_easy_setopt (handle, CURLOPT_FRESH_CONNECT, true); 
    //curl_easy_setopt (handle, CURLOPT_VERBOSE, true); 
    curl_easy_setopt (handle, CURLOPT_FOLLOWLOCATION, true); 
    curl_easy_setopt (handle, CURLOPT_WRITEDATA, file); 
    curl_easy_setopt (handle, CURLOPT_TCP_KEEPALIVE, 0L); 

    curl_easy_setopt (handle, CURLOPT_ENCODING , "gzip"); 
    curl_easy_setopt (handle, CURLOPT_PROXY, (const char*) url); 
    curl_easy_setopt (handle, CURLOPT_HTTPHEADER, chunk); 
    curl_easy_setopt (handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36"); 

    CURLcode res = curl_easy_perform (handle); 
    if (res != CURLE_OK){ 
     printf ("Proxy %s failed with: %d (%s)\n", (const char*) url, res, curl_easy_strerror (res)); 
    } else { 
     long http_code = 0; 
     curl_easy_getinfo (handle, CURLINFO_RESPONSE_CODE, &http_code); 

     printf("Proxy %s finished with code: %d\n", (const char*) url, http_code); 
    } 


    curl_easy_cleanup (handle); 

    return NULL; 
} 

/* Proxy addresses to probe, written as "host:port". */
const char * urls[] = {
     "69.12.64.105:8089",
     "69.12.64.105:7808",
     "210.245.20.170:80",
     "190.74.165.109:8080",
     "39.184.2.111:8123",
     "190.201.166.37:8080",
     "190.36.85.199:8080",
     "92.255.231.54:8080",
     "124.126.126.101:80",
     "43.250.255.65:8080",
     "69.12.64.106:7808",
     "201.217.213.166:8080",
     "178.169.90.188:8888",
     "124.248.205.25:8080",
     "39.190.82.133:8123",
     "190.77.230.36:8080",
     "201.243.204.230:8080",
     "190.201.58.26:8080",
     "178.166.155.36:8080",
     "183.221.188.66:8123",
     "207.66.105.37:24040",
};

/* Number of entries in urls. Derived from the table itself so that adding
   or removing a proxy can never leave the count out of step (a stale count
   would index past the end of the array). */
const int NUMT = (int) (sizeof(urls) / sizeof(urls[0]));


int main(int argc, char **argv) 
{ 
    pthread_t tid[NUMT]; 
    int i; 
    int error; 
    (void)argc; /* we don't use any arguments in this example */ 
    (void)argv; 

    /* Must initialize libcurl before any threads are started */ 
    curl_global_init(CURL_GLOBAL_ALL); 

    init_locks(); 

    for(i=0; i< NUMT; i++) { 
     error = pthread_create(&tid[i], 
           NULL, /* default attributes please */ 
           pull_one_url, 
           (void *)urls[i]); 
     if(0 != error) 
      fprintf(stderr, "Couldn't run thread number %d, errno %d\n", i, error); 
     else 
      fprintf(stderr, "Thread %d, gets %s\n", i, urls[i]); 
    } 

    /* now wait for all threads to terminate */ 
    for(i=0; i< NUMT; i++) { 
     error = pthread_join(tid[i], NULL); 
     fprintf(stderr, "Thread %d terminated\n", i); 
    } 

    kill_locks(); 

    return 0; 
}