2016-01-19 33 views
0

我正試圖從維基百科下載數據。我發送了一個GET請求,但返回的狀態是「HTTP 301重定向」。現在我需要解析響應以獲取目標網址,並從那裡訪問文章內容。我已經完成了解析,但仍然無法訪問文章內容。如果您能幫助我取得文章內容,我將不勝感激。(標題:從維基百科頁面下載數據)

程式碼:

#include <winsock2.h>
#include <WS2tcpip.h>
#include <windows.h>
#include <cstdlib>
#include <iostream>
#include <ostream>
#include <string>
#include <vector>

using namespace std; 

/*
 * Downloads /wiki/<query> from en.wikipedia.org over a plain TCP connection
 * (port 80) using Winsock, prints the complete raw HTTP response, and splits
 * it into header lines and body.
 *
 * Returns 0 on success and -1 on any initialization or network failure.
 *
 * NOTE(review): the server is reported to answer this request with a
 * "301 Moved Permanently". The redirect target is in the "Location" response
 * header; if that target is an https:// URL, following it needs TLS, which
 * raw Winsock sockets do not provide -- confirm and consider WinHTTP/libcurl.
 */
int main(){
    const std::string query = "Google";

    // Initialize Winsock; nothing below may run if this fails.
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        std::cout << "WSAStartup failed.\n";
        system("pause");
        return -1;
    }

    struct addrinfo hints;
    ZeroMemory(&hints, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_protocol = IPPROTO_TCP;
    hints.ai_socktype = SOCK_STREAM;

    /* resolve the wikipedia host name */
    static const char wiki_host[] = "en.wikipedia.org";
    struct addrinfo* targetAdressInfo = NULL;
    const int getAddrRes = getaddrinfo(wiki_host, NULL, &hints, &targetAdressInfo);
    if (getAddrRes != 0 || targetAdressInfo == NULL)
    {
        std::cout << "Could not resolve the Host Name" << std::endl;
        system("pause");
        WSACleanup();
        return -1;
    }

    // Zero the whole structure so the padding member (sin_zero) is defined.
    SOCKADDR_IN sockAddr;
    ZeroMemory(&sockAddr, sizeof(sockAddr));
    sockAddr.sin_addr = ((struct sockaddr_in*) targetAdressInfo->ai_addr)->sin_addr;
    sockAddr.sin_family = AF_INET;
    sockAddr.sin_port = htons(80);

    freeaddrinfo(targetAdressInfo);

    SOCKET webSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (webSocket == INVALID_SOCKET)
    {
        std::cout << "Creation of the Socket Failed" << std::endl;
        system("pause");
        WSACleanup();
        return -1;
    }

    if (connect(webSocket, (SOCKADDR*)&sockAddr, sizeof(sockAddr)) != 0)
    {
        std::cout << "Could not connect";
        system("pause");
        closesocket(webSocket);
        WSACleanup();
        return -1;
    }

    /* build the request */
    std::string http_query = "GET /wiki/" + query + " HTTP/1.1\r\n";
    http_query += std::string("Host: ") + wiki_host + "\r\n";
    // Ask the server to close the connection after the response; the receive
    // loop below relies on the peer closing to know the response is complete.
    http_query += "Connection: close\r\n";
    // A single empty line terminates the header section.
    http_query += "\r\n";

    /* send the request; send() may accept fewer bytes than asked, so loop */
    size_t sent_total = 0;
    while (sent_total < http_query.size()) {
        const int sent = send(webSocket,
                              http_query.data() + sent_total,
                              static_cast<int>(http_query.size() - sent_total),
                              0);
        if (sent <= 0) {
            std::cout << "Could not send the request to the Server" << std::endl;
            system("pause");
            closesocket(webSocket);
            WSACleanup();
            return -1;
        }
        sent_total += static_cast<size_t>(sent);
    }

    /* read the response until the server closes the connection */
    std::string response;
    char recv_buffer[4096];
    for (;;) {
        const int bytes_read = recv(webSocket, recv_buffer,
                                    static_cast<int>(sizeof(recv_buffer)), 0);
        if (bytes_read == 0) {
            break; /* peer closed the connection: response complete */
        }
        if (bytes_read < 0) {
            /* stop immediately; the buffer must not be indexed with -1 */
            std::cout << "Could not receive the response from the Server" << std::endl;
            system("pause");
            closesocket(webSocket);
            WSACleanup();
            return -1;
        }
        // Append by length so embedded NUL bytes do not truncate the data.
        response.append(recv_buffer, static_cast<size_t>(bytes_read));
    }

    /* finished with the socket */
    closesocket(webSocket);
    WSACleanup();

    /* parse the http response: one entry per header line, then the body */
    size_t cursor = 0;
    std::string response_content;
    std::vector<std::string> response_headers;
    const size_t headers_end = response.find("\r\n\r\n");

    while (true) {
        const size_t line_end = response.find("\r\n", cursor);
        if (line_end == std::string::npos) { /* probably due to http error */
            break;
        }
        response_headers.push_back(response.substr(cursor, line_end - cursor));
        if (line_end == headers_end) { /* found content */
            response_content = response.substr(headers_end + 4); /* skip \r\n\r\n */
            break;
        }
        cursor = line_end + 2; /* skip \r\n */
    }

    // Print the whole response. The length is response.size(); sizeof(response)
    // is only the size of the std::string object itself, not of its text.
    std::cout << response;

    system("pause");
    return 0;
}
+0

不要嘗試自己實現HTTP,請改用[Windows WinInet庫](https://msdn.microsoft.com/en-us/library/windows/desktop/aa383630%28v=vs.85%29.aspx)或[cURL庫](http://curl.haxx.se/libcurl/)。從網絡服務器獲取「頁面」時,這會讓事情簡單得多。 –

+0

我建議你使用像curl這樣的高級HTTP客戶端庫,而不是自己手寫一個。這樣更容易,也更不容易出錯。除非你是想學習如何手寫一個。 – leemes

+0

這些API很優秀,但我是想學習自己手寫(: –

回答

0

我不確定你說「我做了解析,但我無法訪問文章內容」是什麼意思。當你收到301狀態時,需要做的是查看響應中的「Location」標頭。該標頭包含一個網址,你可以用它來獲取你要找的內容。