Here is my code:
start_spider.sh
#!/bin/bash
# letters to crawl; one spider instance is launched per letter
parseLetter=('a' 'b')
# change to the scrapy project directory
cd "$path/scrapy/scrapyTodo/scrapyTodo" || exit 1
for letter in "${parseLetter[@]}"; do
    scrapy crawl root -a alpha="$letter" &
done
# keep the script alive until every backgrounded spider finishes
wait
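For reference, supervisor launches the script through a program entry along these lines (a minimal sketch; the program name, user, and every path here are placeholders, not my real config):

[program:start_spider]
; all paths below are placeholders
command=/bin/bash /home/user/start_spider.sh
directory=/home/user
; the $path variable used by the script has to be provided here
environment=path="/home/user"
user=user
autostart=true
autorestart=true
stdout_logfile=/var/log/start_spider.log
redirect_stderr=true

One thing worth noting: supervisor starts processes with a minimal environment rather than a login shell's, so anything the spiders rely on (PATH, $path, locale) has to be set explicitly in the entry.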
Here is my scrapy code:
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from elasticsearch import Elasticsearch
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from urlparse import urljoin
from bs4 import BeautifulSoup
from scrapy.spider import BaseSpider
from tools import sendEmail
from tools import ElasticAction
from tools import runlog
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from datetime import datetime
import re
class studentCrawler(BaseSpider):
    # record when the crawl started
    started_on = datetime.now()
    name = "root"
    DOWNLOAD_DELAY = 0
    allowed_domains = ['website.com']
    ES_Index = "website"
    ES_Type = "root"
    ES_Ip = "127.0.0.1"
    child_type = "level1"
    handle_httpstatus_list = [404, 302, 503, 999, 200]  # add any other status codes you need
    es = ElasticAction(ES_Index, ES_Type, ES_Ip)

    def __init__(self, alpha='', *args, **kwargs):
        super(studentCrawler, self).__init__(*args, **kwargs)
        base_domain = 'https://www.website.com/directory/student-' + str(alpha) + "/"
        self.start_urls = [base_domain]

    def is_empty(self, any_structure):
        """
        Check whether the given data structure is empty.
        :param any_structure: any data
        :return: 1 if non-empty, 0 otherwise
        """
        return 1 if any_structure else 0

    def parse(self, response):
        """
        Main callback that parses the web page.
        :param response: the downloaded page
        """
        if response.status in (404, 503, 999):
            self.es.insertIntoES(response.url, "False")
        elif response.status == 200:
            sel = Selector(response)
            self.es.insertIntoES(response.url, "True")
            self.getAllTheUrl(u''.join(sel.xpath(".//*[@id='seo-dir']/div/div[3]").extract()).strip(), response.url)

    def getAllTheUrl(self, data, parent_id):
        # extract every link from the HTML fragment and store it as a child of the parent URL
        soup = BeautifulSoup(data, 'html.parser')
        for a in soup.find_all('a', href=True):
            self.es.insertChildAndParent(self.child_type, str(a['href']), "False", parent_id)
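The ElasticAction class comes from my tools module, which isn't shown above; roughly, it is a thin wrapper over the elasticsearch-py client along these lines (a hedged sketch under that assumption -- the document body fields and method internals are guesses, only the two method signatures are taken from the spider):

# tools.py (sketch) -- thin wrapper around elasticsearch-py as used by the spider
from datetime import datetime
from elasticsearch import Elasticsearch

class ElasticAction(object):
    def __init__(self, index, doc_type, ip):
        self.index = index
        self.doc_type = doc_type
        self.client = Elasticsearch([{'host': ip}])

    def insertIntoES(self, url, crawled):
        # store a URL together with a crawled-or-not flag
        self.client.index(index=self.index, doc_type=self.doc_type,
                          body={'url': url, 'crawled': crawled,
                                'date': datetime.now().isoformat()})

    def insertChildAndParent(self, child_type, url, crawled, parent_id):
        # store a child link under the page it was found on
        self.client.index(index=self.index, doc_type=child_type,
                          body={'url': url, 'crawled': crawled,
                                'parent': parent_id})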
I found that BeautifulSoup does not work when the spider is started by supervisor. ...
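To see how the environment differs between a shell launch and a supervisor launch, a few lines like these at the top of the spider module print what the supervised process actually runs with (a diagnostic sketch, not part of the original code):

# diagnostic sketch: dump the runtime environment the spider really sees
import os
import sys
import bs4

print "interpreter: ", sys.executable
print "bs4 version: ", bs4.__version__
print "PATH:        ", os.environ.get('PATH')
print "PYTHONPATH:  ", os.environ.get('PYTHONPATH')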
What does your start_spider.sh look like? – Michael