0
這是我的網頁scrapy的簡單結構。如何包裝在scrapy中創建start_urls的過程?
import scrapy,urllib.request
class TestSpider(scrapy.Spider):
def __init__(self, *args, **kw):
self.timeout = 10
name = "quotes"
allowed_domains = ["finance.yahoo.com"]
url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
s1 = s.split('\r\n')[1:-2]
namelist = []
for item in s1:
if "NASDAQ TEST STOCK" not in item:namelist.append(item)
s2 = [s.split('|')[0] for s in namelist]
s3=[]
for symbol in s2:
if "." not in symbol :
s3.append(symbol)
start_urls = ["https://finance.yahoo.com/quote/"+s+"/financials?p="+s for s in s2]
def parse(self, response):
content = response.body
target = response.url
#doing somthing ,omitted code
將其保存爲test.py並與scrapy runspider test.py
運行它。
現在我想包裝所有創建start_urls的代碼。
我在這裏試試。
class TestSpider(scrapy.Spider):
def __init__(self, *args, **kw):
self.timeout = 10
url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
s1 = s.split('\r\n')[1:-2]
namelist = []
for item in s1:
if "NASDAQ TEST STOCK" not in item : namelist.append(item)
s2 = [s.split('|')[0] for s in namelist]
s3=[]
for symbol in s2:
if "." not in symbol : s3.append(symbol)
self.start_urls = ["https://finance.yahoo.com/quote/"+s+"/financials?p="+s for s in s3]
它不能工作。