# Run a spider from a plain Python script using CrawlerProcess, with the
# project's settings loaded via get_project_settings().
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

# NOTE(review): MySpider is not defined in this snippet; it must be defined
# or imported before this line for the script to run.
process.crawl(MySpider)
process.start()  # the script will block here until the crawling is finished
然后你就可以直接执行这个脚本：
1
python run.py
另外一个功能更强大的类是 `scrapy.crawler.CrawlerRunner`，推荐你使用这个：
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Run a single spider with CrawlerRunner, driving the Twisted reactor by hand.
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class MySpider(scrapy.Spider):
    # Your spider definition goes here.
    pass


configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

d = runner.crawl(MySpider)
# Stop the reactor whether the crawl succeeded or failed (addBoth covers
# both outcomes), so that reactor.run() below can return.
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
# Run several spiders in the same process with one CrawlerRunner.
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class MySpider1(scrapy.Spider):
    # Your first spider definition goes here.
    pass


class MySpider2(scrapy.Spider):
    # Your second spider definition goes here.
    pass


configure_logging()
runner = CrawlerRunner()
runner.crawl(MySpider1)
runner.crawl(MySpider2)

# join() is used here to wait on both scheduled crawls at once; the reactor
# is stopped from its callback regardless of success or failure.
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()  # the script will block here until all crawling jobs are finished
from coolscrapy.utils import parse_text
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from coolscrapy.items import Article


class ArticleSpider(CrawlSpider):
    """Crawl spider whose behaviour is configured by a ``rule`` object.

    The ``rule`` passed to the constructor supplies the spider name, the
    comma-separated allowed domains and start URLs, the link-extraction
    settings, and the XPath expressions used to pull fields out of each
    article page.
    """

    name = "article"

    def __init__(self, rule):
        """Configure the spider from ``rule``.

        Attributes read from ``rule`` here: ``name``, ``allow_domains``,
        ``start_urls``, ``next_page``, ``allow_url`` and ``extract_from``.
        """
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allow_domains.split(",")
        self.start_urls = rule.start_urls.split(",")

        rule_list = []
        # Add the "next page" rule (only when the rule defines one).
        if rule.next_page:
            rule_list.append(
                Rule(LinkExtractor(restrict_xpaths=rule.next_page))
            )
        # Add the rule that extracts article links.
        rule_list.append(
            Rule(
                LinkExtractor(
                    allow=[rule.allow_url],
                    restrict_xpaths=[rule.extract_from],
                ),
                callback='parse_item',
            )
        )
        # NOTE(review): rules are assigned before super().__init__() is
        # called - presumably because CrawlSpider processes ``rules`` during
        # initialisation; confirm against the Scrapy docs before reordering.
        self.rules = tuple(rule_list)
        super(ArticleSpider, self).__init__()

    def parse_item(self, response):
        """Build an ``Article`` item from an article page response.

        Each field is extracted with the XPath stored on ``self.rule`` and
        then passed through ``parse_text`` together with the rule name and
        the field name.
        """
        self.log('Hi, this is an article page! %s' % response.url)

        article = Article()
        article["url"] = response.url

        title = response.xpath(self.rule.title_xpath).extract()
        article["title"] = parse_text(title, self.rule.name, 'title')

        body = response.xpath(self.rule.body_xpath).extract()
        article["body"] = parse_text(body, self.rule.name, 'body')

        publish_time = response.xpath(self.rule.publish_time_xpath).extract()
        article["publish_time"] = parse_text(
            publish_time, self.rule.name, 'publish_time'
        )

        article["source_site"] = self.rule.source_site
        return article
@contextmanager
def session_scope(Session):
    """Provide a transactional scope around a series of operations.

    ``Session`` is called with no arguments to create a session. The session
    is yielded to the ``with`` body; it is committed if the body finishes
    normally, rolled back (and the exception re-raised) if the body raises,
    and closed in either case.
    """
    session = Session()
    try:
        yield session
        session.commit()
    except:
        # Deliberately broad: roll back on any failure, then re-raise so the
        # caller still sees the original exception.
        session.rollback()
        raise
    finally:
        session.close()


class ArticleDataBasePipeline(object):
    """Save articles to the database."""

    def __init__(self):
        engine = db_connect()
        create_news_table(engine)
        # Session factory bound to the engine; one session is opened per item.
        self.Session = sessionmaker(bind=engine)

    def open_spider(self, spider):
        """This method is called when the spider is opened."""
        pass

    def process_item(self, item, spider):
        """Persist ``item`` as an ``Article`` row and hand the item onwards.

        Returns the item so that any later pipeline stage still receives it.
        """
        # NOTE(review): .encode("utf-8") yields bytes under Python 3 -
        # confirm the column types expect that.
        a = Article(
            url=item["url"],
            title=item["title"].encode("utf-8"),
            publish_time=item["publish_time"].encode("utf-8"),
            body=item["body"].encode("utf-8"),
            source_site=item["source_site"].encode("utf-8"),
        )
        with session_scope(self.Session) as session:
            session.add(a)
        return item

    def close_spider(self, spider):
        """Called with the spider when it is closed; nothing to clean up."""
        pass
# Launcher: load the enabled article rules from the database and run one
# ArticleSpider per rule in a single process.
import logging
from spiders.article_spider import ArticleSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from coolscrapy.models import db_connect
from coolscrapy.models import ArticleRule
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)

    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    try:
        # Only rules flagged as enabled are crawled.
        rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    finally:
        # Close the session even if the query fails.
        session.close()

    runner = CrawlerRunner(settings)
    for rule in rules:
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # blocks process so always keep as the last statement
    reactor.run()
    logging.info('all finished.')