0
我已經編寫了一個程序,用於從網絡上抓取(scrape)一些數據,如下所示。Python Scrapy 編碼 utf-8
import scrapy
class JPItem(scrapy.Item):
    """Scrapy item holding the two text fields collected per question page."""

    # Quoted, escaped text of the question body.
    question_content = scrapy.Field()
    # Quoted, escaped text of the answer marked as best.
    best_answer = scrapy.Field()
class JPSpider(scrapy.Spider):
    """Spider for question listings on chiebukuro.yahoo.co.jp.

    Starts from the listing page for 2004-01-01. Listing pages either
    report "no matching questions" (in which case listing pages for every
    day of 2004-2006 are scheduled) or contain question links, which are
    followed and scraped into ``JPItem`` objects by ``parse_info``.
    """

    name = "jp"
    allowed_domains = ['chiebukuro.yahoo.co.jp']

    # Listing URL; only the date part varies between requests.
    _LIST_URL = ('https://chiebukuro.yahoo.co.jp/dir/list.php'
                 '?did=2078297790&flg=1&sort=3&type=list'
                 '&year={year}&month={month}&day={day}&page=1')

    # Text shown by the site when a listing has no questions.
    # NOTE(review): this must match the page text exactly; confirm the
    # first character against the live site before relying on it.
    _NO_RESULTS_TEXT = u'條件に一致する質問はみつかりませんでした。'

    def start_requests(self):
        """Yield the request for the first listing page (2004-01-01)."""
        url = self._LIST_URL.format(year=2004, month=1, day=1)
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Handle one listing page.

        Yields listing requests for every date in 2004-2006 when the page
        shows the "no results" message; otherwise yields one request per
        question link plus a request for the next listing page, if any.
        """
        # extract_first() returns the text itself. The previous code compared
        # str(list) -- e.g. "['...']" -- to the message, which never matched.
        notice = response.css("div.qa-list small::text").extract_first(default=u'')
        if notice.strip() == self._NO_RESULTS_TEXT:
            for year in range(2004, 2007):
                for month in range(1, 13):
                    # Days run 1-31 for every month, so impossible dates
                    # (e.g. Feb 30) are requested too, as before.
                    for day in range(1, 32):
                        url = self._LIST_URL.format(year=year, month=month, day=day)
                        yield scrapy.Request(url, self.parse)
        else:
            # Iterate over the links actually present instead of indexing a
            # fixed 0..39 range, which raised IndexError on short pages.
            hrefs = response.xpath('//ul[@id="qalst"]/li/dl/dt/a/@href').extract()
            for href in hrefs:
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones, which scrapy.Request would reject.
                yield scrapy.Request(response.urljoin(href), self.parse_info)
            next_page = response.css("div.qa-list p.flip a.next::attr(href)").extract_first()
            if next_page is not None:
                yield scrapy.Request(response.urljoin(next_page), self.parse)

    @staticmethod
    def _quote_fragments(fragments, strip_tags):
        """Join fragments, escape newline/CR/tab, drop tags, wrap in quotes."""
        text = ''.join(fragments)
        text = text.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
        for tag in strip_tags:
            text = text.replace(tag, "")
        return "\"" + text + "\""

    def parse_info(self, response):
        """Scrape the question text and best answer from a question page.

        Yields a single ``JPItem`` whose fields are double-quoted strings
        with newlines, carriage returns and tabs written as escape sequences.
        """
        item = JPItem()
        question_parts = (
            response.css("div.mdPstdQstn div.ptsQes p:not([class])").extract()
            + response.css("div.mdPstdQstn div.ptsQes p.queTxt::text").extract()
        )
        item['question_content'] = self._quote_fragments(
            question_parts, ("<p>", "</p>", "<br>"))
        answer_parts = (
            response.css("div.mdPstdBA div.ptsQes p.queTxt::text").extract()
            + response.css("div.mdPstdBA div.ptsQes p:not([class])").extract()
        )
        # NOTE(review): "<br>" is stripped from the question but not from the
        # answer, exactly as before; confirm whether that is intended.
        item['best_answer'] = self._quote_fragments(
            answer_parts, ("<p>", "</p>"))
        yield item
我發現問題應該出在這一行:
if str(response.css("div.qa-list small::text").extract()) == '條件に一致する質問はみつかりませんでした。':
因爲當我運行程序時,它無法檢測到這個條件;即使提取出的文字應該與指定的字串相等,程序也只會跳到 else 分支。我試過使用 .encode("utf-8"),但似乎無法解決問題。有人可以針對這個問題提供一些建議嗎?
非常感謝。
嘗試 `if response.css("div.qa-list small::text").extract_first() == u'條件に一致する質問はみつかりませんでした。':` –
@paultrmbrth 剛剛試過了,有用! –
@paultrmbrth 它運作得很完美!謝謝! –