2017-07-15 20 views
-4
import requests 
from lxml import html 


# Search endpoint that crawl() requests with the query parameters.
SEARCH_URL = "https://www.yellowpages.com/search" 


def crawl(name, state, page=1): 
    """Fetch one page of search results and yield one tuple per listing.

    Args:
        name: Search term, sent as the ``search_terms`` query parameter.
        state: Location, sent as the ``geo_location_terms`` query parameter.
        page: Result page number, sent as the ``page`` query parameter.

    Yields:
        ``(name, address, phone, showing)`` for every ``div`` whose class is
        ``info`` in the returned HTML.
    """
    params={'search_terms': name, 'geo_location_terms': state, 'page': page} 
    data = requests.get(SEARCH_URL, params=params).text 
    tree = html.fromstring(data) 
    for items in tree.xpath("//div[@class='info']"): 
     # Rebinds the ``name`` parameter; the request has already been sent, so
     # the search term is no longer needed at this point.
     name = items.findtext(".//span[@itemprop='name']") 
     address = items.findtext(".//span[@class='street-address']") 
     phone = items.findtext(".//div[@itemprop='telephone']") 
     # NOTE: this is the line the traceback quoted below points at.
     # findtext() is given a path starting with '//' and raises
     # "SyntaxError: cannot use absolute path on element".
     showing = items.findtext("//*[@id='main-content']/div[2]/div[4]/p/text()") 


     yield (name, address, phone, showing) 


def search(name, state, pages=1):
    """Print every listing found on the first ``pages`` result pages.

    Args:
        name: Search term forwarded to ``crawl``.
        state: Location forwarded to ``crawl``.
        pages: Number of result pages to fetch, starting at page 1. Values
            below 1 fetch nothing.
    """
    # range() replaces the earlier ``while page is not pages`` loop. That loop
    # compared integers by identity, never terminated when ``pages`` was
    # below 1, and stopped one page short (the default ``pages=1`` printed
    # nothing at all).
    for page in range(1, pages + 1):
        for result in crawl(name, state, page=page):
            # Parenthesised form prints the tuple identically on Python 2
            # and is also valid on Python 3.
            print(result)


# Script entry point: run a sample search for 'pizza' in 'tx'.
if __name__ == '__main__': 
    search('pizza', 'tx', pages=10) 

回溯:如何解決語法錯誤:元素不能使用絕對路徑

Traceback (most recent call last): 
    File "C:/Python27/Scripts/yellowpages.py", line 31, in <module> 
    search('pizza', 'tx', pages=10) 
    File "C:/Python27/Scripts/yellowpages.py", line 25, in search 
    for result in crawl(name, state, page=page): 
    File "C:/Python27/Scripts/yellowpages.py", line 16, in crawl 
    showing = items.findtext("//*[@id='main-content']/div[2]/div[4]/p/text()") 
    File "src\lxml\lxml.etree.pyx", line 1550, in lxml.etree._Element.findtext (src\lxml\lxml.etree.c:59189) 
    File "C:\Python27\lib\site-packages\lxml\_elementpath.py", line 320, in findtext 
    el = find(elem, path, namespaces) 
    File "C:\Python27\lib\site-packages\lxml\_elementpath.py", line 302, in find 
    it = iterfind(elem, path, namespaces) 
    File "C:\Python27\lib\site-packages\lxml\_elementpath.py", line 291, in iterfind 
    selector = _build_path_iterator(path, namespaces) 
    File "C:\Python27\lib\site-packages\lxml\_elementpath.py", line 260, in _build_path_iterator 
    raise SyntaxError("cannot use absolute path on element") 
SyntaxError: cannot use absolute path on element 
+0

請問您可以分享SEARCH_URL嗎? –

+1

如果是語法錯誤,它與xpath無關。提供完整的錯誤追蹤,以便有人可以提供幫助。 – Rahul

回答

0

問題出在這一行:

showing = items.findtext("//*[@id='main-content']/div[2]/div[4]/p/text()") 

將 crawl 函式改成這樣:

def crawl(name, state, page=1):
    """Fetch one page of search results and yield one tuple per listing.

    Args:
        name: Search term, sent as the ``search_terms`` query parameter.
        state: Location, sent as the ``geo_location_terms`` query parameter.
        page: Result page number, sent as the ``page`` query parameter.

    Yields:
        ``(name, address, phone, showing)`` for every ``div`` whose class is
        ``info``. ``showing`` is the first text node of the pagination
        paragraph (the same value for every listing on the page), or ``None``
        when the page has no pagination block.
    """
    params = {'search_terms': name, 'geo_location_terms': state, 'page': page}
    data = requests.get(SEARCH_URL, params=params).text
    tree = html.fromstring(data)

    # The pagination summary belongs to the page, not to a single listing, so
    # look it up once instead of re-running the XPath for every result.
    pagination = tree.xpath(".//div[@class='pagination']/p/text()")
    # Guard against pages without a pagination block rather than letting
    # ``[0]`` raise IndexError.
    showing = pagination[0] if pagination else None

    for item in tree.xpath("//div[@class='info']"):
        # Use a separate local so the ``name`` parameter is not overwritten.
        title = item.findtext(".//span[@itemprop='name']")
        address = item.findtext(".//span[@class='street-address']")
        phone = item.findtext(".//div[@itemprop='telephone']")

        yield (title, address, phone, showing)

它會產生以下結果:

(None, None, None, '1-30\nof 3030') 
('Port "A" Pizzeria', '407 E Avenue G', '(361) 749-5226', '1-30\nof 3030') 
("Palio's Pizza Cafe", '3492 Legacy Dr', '(214) 308-6895', '1-30\nof 3030') 
('Pizza Inn', '1501 Magnolia Ave', '(409) 242-2870', '1-30\nof 3030') 
("Papa Murphy's Take & Bake Pizza", '815 SW Alsbury Blvd', '(817) 447-6777', '1-30\nof 3030') 
("Lane's", '630 Sabine St', '(409) 787-3838', '1-30\nof 3030') 
("Little Ceasar's Pizza", '1000 N Midkiff Rd', '(432) 694-3676', '1-30\nof 3030') 
('The Gaff', '323 Beach Ave', '(361) 749-5970', '1-30\nof 3030') 
("CiCi's Pizza", '1440 N Highway 77', '(972) 937-1222', '1-30\nof 3030') 
...... 
相關問題