2015-10-01 70 views
5

我用下面的代碼來獲得谷歌的搜索結果:

library(XML) 
library(RCurl) 
#' Build a Google search URL for one or more search terms.
#'
#' The search term is treated as raw text: every character outside the
#' unreserved set (letters, digits, `.`, `_`, `~`, `-`) is percent-encoded,
#' so spaces become `%20` and characters such as `&` or `#` cannot break the
#' query string.
#'
#' @param search.term Character vector of search phrases (coerced with
#'   `as.character()`).
#' @param domain Domain suffix appended directly after "http://www.google".
#' @param quotes If TRUE, wrap the encoded term in `%22` (an encoded double
#'   quote) on both sides.
#' @return Character vector of search URLs.
getGoogleURL <- function(search.term, domain = '.co.uk', quotes = TRUE) {
  # Encode term by term so this works whether or not URLencode() is
  # vectorised in the running R version. repeated = TRUE because the input is
  # plain text, so a literal "%" must itself be encoded.
  encoded <- vapply(
    as.character(search.term),
    function(term) utils::URLencode(term, reserved = TRUE, repeated = TRUE),
    character(1),
    USE.NAMES = FALSE
  )
  if (quotes) {
    encoded <- paste0("%22", encoded, "%22")
  }
  paste0("http://www.google", domain, "/search?q=", encoded)
}

    #' Download a Google results page and extract link targets.
    #'
    #' @param google.url URL of the results page to fetch.
    #' @return Character vector with the `href` of every matched anchor;
    #'   `character(0)` when nothing matches.
    getGoogleLinks <- function(google.url) {
      page_source <- getURL(google.url,
                            httpheader = c("User-Agent" = "R(2.10.0)"))
      # The empty handler discards HTML parser diagnostics.
      parsed <- htmlTreeParse(page_source, useInternalNodes = TRUE,
                              error = function(...) {})
      # NOTE(review): this selector only matches anchors whose class is
      # exactly "l"; if the served markup uses different classes the result
      # is empty -- confirm against the HTML actually returned.
      anchors <- getNodeSet(parsed, "//a[@href][@class='l']")
      # Read the href attribute by name; taking the first attribute by
      # position returns the wrong value when href is not listed first.
      vapply(anchors,
             function(anchor) xmlGetAttr(anchor, "href", NA_character_),
             character(1))
    }

# Search for the unquoted term "cran" and collect the result links.
search.term <- "cran"
quotes <- FALSE  # a logical flag, not the character string "FALSE"
search.url <- getGoogleURL(search.term = search.term, quotes = quotes)

links <- getGoogleLinks(search.url)

我想找出這次搜索返回的所有鏈接,但得到的結果如下:

> links 
list() 

我該如何取得這些鏈接? 此外,我還想取得谷歌搜索結果的標題和摘要,應該怎麼做? 最後,有沒有辦法取得結果中 ChillingEffects.org 的鏈接?

+1

http://stackoverflow.com/a/22703153/1457051 – hrbrmstr

回答

6

如果您查看html變量,您可以看到搜索結果鏈接全部嵌套在<h3 class="r">標記中。

請嘗試把 getGoogleLinks 函數改爲:

#' Download a Google results page and extract the result links.
#'
#' Collects anchors nested inside `<h3 class="r">` elements.
#'
#' @param google.url URL of the results page to fetch.
#' @return Character vector with the `href` of every matched anchor
#'   (`NA` for an anchor without one); `character(0)` when nothing matches.
getGoogleLinks <- function(google.url) {
  # Keep the User-Agent on one line: a line break inside the literal would
  # be sent as part of the header value.
  page_source <- getURL(google.url,
                        httpheader = c("User-Agent" = "R(2.10.0)"))
  # The empty handler discards HTML parser diagnostics.
  parsed <- htmlTreeParse(page_source, useInternalNodes = TRUE,
                          error = function(...) {})
  anchors <- getNodeSet(parsed, "//h3[@class='r']//a")
  # vapply() guarantees a character vector even when no anchors are found.
  vapply(anchors,
         function(anchor) xmlGetAttr(anchor, "href", NA_character_),
         character(1))
}
3

我寫了下面這個函數:它讀入一份公司名稱列表,然後取得每家公司搜索結果中排在最前面的網站。你可以先用它入門,再根據需要進行調整。

# Libraries.
library(utils)  # provides URLencode(); there is no package named "URLencode"
library(rvest)

# Load data: the CSV is expected to have a Company.Name column.
d <- read.csv("P:\\needWebsites.csv")
# NOTE: this variable is named `c`; calls such as c(1, 2) still resolve to
# the base function because R skips non-function bindings in call position.
c <- as.character(d$Company.Name)

# Function for getting website. 
#' Look up the first `<cite>` entry Google returns for a name.
#'
#' @param name Search text (for example a company name); treated as raw
#'   text and fully percent-encoded before being placed in the query.
#' @return A length-one character vector holding the text of the first
#'   `<cite>` node on the results page, or `NA` when the page has none.
getWebsite <- function(name) {
  # Encode only the query value, with reserved characters included, so that
  # names such as "AT&T" do not split the query string.
  query <- URLencode(as.character(name), reserved = TRUE, repeated = TRUE)
  url <- paste0("https://www.google.com/search?q=", query)

  page <- read_html(url)

  # Text of every node of type cite; change the selector to grab other
  # node types.
  cites <- html_text(html_nodes(page, "cite"))

  # Only the first hit is returned; return `cites` to see them all.
  as.character(cites[1])
}

# Apply the function to each company name; vapply() keeps the column a
# character vector even when the list of names is empty.
websites <- data.frame(Website = vapply(c, getWebsite, character(1)))