2016-01-19 78 views
0

我試圖從維基百科下載數據。我發送一個GET請求,但返回只包含頁面狀態和一些HTML細節。無法通過Wikipedia API下載數據

我在做什麼錯?

#include <winsock2.h> 
#include <WS2tcpip.h> 
#include <windows.h> 
#include <iostream> 

// Requests the intro extract of the "Google" article from the MediaWiki API
// over a plain TCP socket (Winsock) and writes the raw HTTP response
// (status line, headers and body) to stdout.
//
// Returns 0 on success and -1 on any failure.
//
// NOTE: Wikipedia only serves content over HTTPS. A plain-HTTP request on
// port 80 is answered with a "301 Moved Permanently" pointing at the
// https:// URL, so this program will print that redirect. Retrieving the
// actual JSON requires a TLS-capable client (e.g. WinHTTP/WinINet or a TLS
// library); raw Winsock alone cannot do it.
int main() {
    using std::cout;
    using std::endl;

    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        cout << "WSAStartup failed.\n";
        system("pause");
        return -1;
    }

    struct addrinfo hints;
    ZeroMemory(&hints, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_protocol = IPPROTO_TCP;
    hints.ai_socktype = SOCK_STREAM;

    // Resolve the host that is named in the Host header below, and let
    // getaddrinfo fill in the port so the returned sockaddr is fully
    // initialised and can be passed to connect() directly.
    struct addrinfo* targetAdressInfo = NULL;
    int getAddrRes = getaddrinfo("en.wikipedia.org", "80", &hints, &targetAdressInfo);
    if (getAddrRes != 0 || targetAdressInfo == NULL) {
        cout << "Could not resolve the Host Name" << endl;
        system("pause");
        WSACleanup();
        return -1;
    }

    SOCKET webSocket = socket(targetAdressInfo->ai_family,
                              targetAdressInfo->ai_socktype,
                              targetAdressInfo->ai_protocol);
    if (webSocket == INVALID_SOCKET) {
        cout << "Creation of the Socket Failed" << endl;
        system("pause");
        freeaddrinfo(targetAdressInfo);
        WSACleanup();
        return -1;
    }

    int connectRes = connect(webSocket, targetAdressInfo->ai_addr,
                             static_cast<int>(targetAdressInfo->ai_addrlen));
    freeaddrinfo(targetAdressInfo);  // address is no longer needed either way
    if (connectRes != 0) {
        cout << "Could not connect";
        system("pause");
        closesocket(webSocket);
        WSACleanup();
        return -1;
    }

    // Sending a HTTP-GET-Request to the Web Server.
    // The request line is "GET <path> HTTP/1.1" with single spaces; HTTP/1.1
    // requires a Host header. "Connection: close" makes the server close the
    // socket after the response, which is what ends the receive loop below.
    const char* httpRequest =
        "GET /w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google HTTP/1.1\r\n"
        "Host: en.wikipedia.org\r\n"
        "User-Agent: WinsockWikiExample/1.0\r\n"
        "Connection: close\r\n"
        "\r\n";
    const int requestLen = static_cast<int>(strlen(httpRequest));

    // send() may accept fewer bytes than requested; keep going until the
    // whole request has been handed to the socket.
    int totalSent = 0;
    while (totalSent < requestLen) {
        int sentBytes = send(webSocket, httpRequest + totalSent, requestLen - totalSent, 0);
        if (sentBytes == SOCKET_ERROR || sentBytes <= 0) {
            cout << "Could not send the request to the Server" << endl;
            system("pause");
            closesocket(webSocket);
            WSACleanup();
            return -1;
        }
        totalSent += sentBytes;
    }

    // A modest buffer is enough because the response is streamed chunk by
    // chunk; a 1 MB stack array would risk overflowing the default stack.
    char buffer[4096];
    int dataLen;
    // The parentheses matter: assign recv()'s byte count first, then compare.
    while ((dataLen = recv(webSocket, buffer, sizeof(buffer), 0)) > 0) {
        // Print exactly the bytes received in this chunk, nothing more.
        cout.write(buffer, dataLen);
    }
    cout << endl;

    int exitCode = 0;
    if (dataLen == SOCKET_ERROR) {
        cout << "Could not receive the response from the Server" << endl;
        exitCode = -1;
    }

    closesocket(webSocket);
    WSACleanup();

    system("pause");
    return exitCode;
}
+0

您的GET請求似乎格式錯誤。我建議使用工具(例如Wireshark)來檢查瀏覽器發送的HTTP標頭(headers)。 –

+0

您必須使用HTTPS。此外,詢問有關錯誤消息的問題,卻不說明錯誤消息的具體內容,並不是很有幫助。 – Tgr

回答

0

如果我沒記錯,維基百科會回覆您一個HTTP 301重定向(redirection)。

您需要解析以獲取目標網址,然後才能訪問文章內容。

您可以在此處(here)看到執行此操作的一些示例代碼。

+0

非常感謝,現在我會試試 –

+0

我嘗試使用示例來獲得HTTP 301重定向,但是當我在wiki_fetch_raw函數中執行解析任務時,我沒有得到任何頁面的主體? –

+0

您需要在您從第一個HTTP請求獲取的目標URL處執行第二個HTTP請求。請注意,在示例代碼中,wiki_fetch_raw函數是遞歸的,以便處理多個重定向級別。 –

0

改用Python腳本如何?

try:  # Python 3
    from urllib.request import Request, urlopen
except ImportError:  # Python 2, which the original snippet targeted
    from urllib2 import Request, urlopen
from contextlib import closing


def get_page(url):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    A User-Agent header is sent because Wikimedia asks API clients to
    identify themselves. The response object is closed before returning.

    :param url: absolute URL to fetch.
    :return: the response body as returned by ``read()`` (undecoded bytes).
    """
    request = Request(url, headers={"User-Agent": "WikiExtractExample/1.0"})
    with closing(urlopen(request)) as response:
        return response.read()


if __name__ == "__main__":
    # The URL must contain only the URL: the " HTTP/1.1\r\n\r\n" suffix belongs
    # to a raw request line and is added by the HTTP library, not the caller.
    # HTTPS is used because Wikipedia redirects plain-HTTP requests.
    url = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google"
    print(get_page(url).decode("utf-8"))
+0

Python是我的BFF,但在項目的其餘部分,我需要使用C++ –