1
我想創建一個簡單的C程序,從網頁中去除HTML並只保留文本。到目前爲止,我寫出了下面的代碼。它使用cURL獲取網頁的內容並將其寫入文件。我該如何遍歷內存緩衝區、刪除所有HTML標籤,並把剩下的文本輸出到終端或文件?(標題:在C中從緩衝區讀取)
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#define WEBPAGE_URL "http://homepages.paradise.net.nz/adrianfu/index.html"
#define DESTINATION_FILE "/home/acwest/data.txt"
/*
 * Write callback handed to libcurl via CURLOPT_WRITEFUNCTION.
 *
 * ptr    - start of the chunk to store
 * size   - size in bytes of one item
 * nmeb   - number of items in the chunk
 * stream - the FILE * registered with CURLOPT_WRITEDATA
 *
 * Returns the number of complete items fwrite() managed to write.
 */
size_t write_data(void *ptr, size_t size, size_t nmeb, void *stream)
{
    FILE *out = stream;
    size_t items_written = fwrite(ptr, size, nmeb, out);

    return items_written;
}
int main()
{
int in_tag = 0;
char * buffer;
char c;
long lSize;
size_t result;
FILE * file = fopen(DESTINATION_FILE,"w+");
if (file==NULL) {
fputs ("File error",stderr);
exit (1);
}
CURL *handle = curl_easy_init();
curl_easy_setopt(handle,CURLOPT_URL,WEBPAGE_URL); /*Using the http protocol*/
curl_easy_setopt(handle,CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(handle,CURLOPT_WRITEDATA, file);
curl_easy_perform(handle);
curl_easy_cleanup(handle);
// obtain file size:
fseek (file, 0, SEEK_END);
lSize = ftell (file);
rewind (file);
// allocate memory to contain the whole file:
buffer = (char*) malloc (sizeof(char)*lSize);
if (buffer == NULL) {
fputs ("Memory error",stderr);
exit (2);
}
// copy the file into the buffer:
result = fread (buffer,1,lSize,file);
if (result != lSize) {
fputs ("Reading error",stderr);
exit (3);
}
}
您可以使用像 http://expat.sourceforge.net/ 這樣的現有解析庫。 – Cyclonecode 2012-02-25 10:53:49
只是提一句:您試圖實現的目標,用curl和sed寫成bash腳本大約只需要一行。 – qdii 2012-02-25 10:54:58
@ user667430:你的代碼甚至沒有編譯... – qdii 2012-02-25 10:58:58