from coolscrapy.items import HuxiuItem
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class LinkSpider(CrawlSpider):
    """Crawl huxiu.com from its index page and scrape article pages.

    Group index links are followed recursively; article links are handed
    to ``parse_item``.
    """

    name = "link"
    allowed_domains = ["huxiu.com"]
    start_urls = ["http://www.huxiu.com/index.php"]

    rules = (
        # Extract links matching '/group?f=index_group' (but never
        # 'deny.php'). No callback is defined, so follow defaults to True
        # and these pages are crawled recursively.
        # The allow/deny entries are regular expressions, so the literal
        # '?' must be escaped; unescaped it would make the 'p' optional
        # and never match the query-string separator.
        Rule(LinkExtractor(allow=(r'/group\?f=index_group',),
                           deny=(r'deny\.php',))),
        # Extract links matching '/article/<digits>/<digits>.html' and
        # parse the downloaded pages with parse_item; not followed
        # recursively.
        Rule(LinkExtractor(allow=(r'/article/\d+/\d+\.html',)),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield a HuxiuItem with the title, link and post time of an article.

        Pages that lack the title or the post time inside the
        ``article-wrap`` container are skipped with a warning instead of
        raising ``IndexError``.

        :param response: the downloaded article page.
        """
        self.logger.info('Hi, this is an item page! %s', response.url)
        detail = response.xpath('//div[@class="article-wrap"]')

        title = detail.xpath('h1/text()').extract_first()
        posttime = detail.xpath(
            'div[@class="article-author"]/span[@class="article-time"]/text()'
        ).extract_first()
        if title is None or posttime is None:
            # extract_first() returns None when the XPath matches nothing.
            self.logger.warning(
                'Skipping %s: article title or post time not found',
                response.url)
            return

        item = HuxiuItem()
        item['title'] = title
        item['link'] = response.url
        item['posttime'] = posttime
        print(item['title'], item['link'], item['posttime'])
        yield item
from coolscrapy.items import BlogItem
import scrapy
from scrapy.spiders import XMLFeedSpider


class XMLSpider(XMLFeedSpider):
    """Parse an Atom feed and turn each ``atom:entry`` into a BlogItem."""

    name = "xml"
    namespaces = [('atom', 'http://www.w3.org/2005/Atom')]
    # NOTE(review): the start URL host is www.pycoding.com while only
    # github.io is listed here -- confirm this mismatch is intentional.
    allowed_domains = ["github.io"]
    start_urls = ["http://www.pycoding.com/atom.xml"]
    # The default 'iternodes' iterator does not seem to work for XML that
    # uses namespaces.
    iterator = 'xml'
    itertag = 'atom:entry'

    def parse_node(self, response, node):
        """Build a BlogItem from one ``atom:entry`` node.

        :param response: the feed response the node came from.
        :param node: selector for the current ``atom:entry`` element.
        :returns: the populated item, or ``None`` (after logging a warning)
            when the entry lacks any of the expected elements.
        """
        field_queries = (
            ('title', 'atom:title/text()'),
            ('link', 'atom:link/@href'),
            ('id', 'atom:id/text()'),
            ('published', 'atom:published/text()'),
            ('updated', 'atom:updated/text()'),
        )
        item = BlogItem()
        for field, query in field_queries:
            # extract_first() returns None when nothing matches, where
            # indexing with [0] would raise IndexError. Skipping just this
            # entry avoids an exception escaping from parse_node.
            value = node.xpath(query).extract_first()
            if value is None:
                self.logger.warning(
                    'Skipping feed entry in %s: no match for %s',
                    response.url, query)
                return None
            item[field] = value
        self.logger.info('|'.join(
            [item['title'], item['link'], item['id'], item['published']]))
        return item
from coolscrapy.items import BlogItem
from scrapy.spiders import CSVFeedSpider


class CSVSpider(CSVFeedSpider):
    """Read a semicolon-delimited CSV feed and emit one BlogItem per row."""

    name = "csv"
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.csv']

    # CSV dialect and column names for the feed.
    delimiter = ';'
    quotechar = "'"
    headers = ['id', 'name', 'description']

    def parse_row(self, response, row):
        """Log the row and copy its 'id' and 'name' values into a BlogItem.

        :param response: the feed response the row came from.
        :param row: mapping indexed by the column names in ``headers``.
        :returns: the populated item.
        """
        self.logger.info('Hi, this is a row!: %r', row)
        entry = BlogItem()
        # NOTE(review): 'description' is listed in headers but not copied,
        # and 'name' presumably has to be a declared BlogItem field --
        # confirm against coolscrapy.items.
        for column in ('id', 'name'):
            entry[column] = row[column]
        return entry