第一次尝试就出了问题,抓不到任何数据:
想查一下 CSDN 知识库有多少条目,结果一条也查不出来。
from pyspider.libs.base_handler import *
import logging

# Import urljoin explicitly instead of relying on the star import above to
# provide it; the fallback keeps the script usable on Python 2.
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class Handler(BaseHandler):
    """First attempt at crawling the CSDN knowledge-base listing.

    Starts from the listing page, follows every link that points at an
    individual knowledge base, follows the pagination links, and returns the
    URL and title of each knowledge-base page.

    NOTE: the surrounding write-up reports that the selectors used on
    ``response.doc`` matched nothing for this site.
    """

    def on_start(self):
        """Seed the crawl with the knowledge-base listing page."""
        self.crawl('http://lib.csdn.net/bases', callback=self.index_page)

    def index_page(self, response):
        """Queue knowledge-base links and pagination links found on a listing page."""
        for each in response.doc('a[href^="http://lib.csdn.net/base/"]').items():
            logging.debug(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)

        for each in response.doc('a[href^="http://lib.csdn.net/bases/fd/all/ast/all/"]').items():
            # Joining with '?=' swaps any existing query string for a literal
            # "?=", which is then removed: the net effect is the link without
            # its query string.
            page_url = urljoin(each.attr.href, '?=').replace('?=', '')
            # Pagination pages are listing pages too, so they go back through
            # index_page (the original referenced a non-existent in_page).
            self.crawl(page_url, callback=self.index_page)

    def detail_page(self, response):
        """Return the URL and <title> text of a knowledge-base page."""
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
苦呀,试了各种选择器都不行,
于是在本地建了个文件(一个python,一个txt)试了一下,
咦?把文件的首行(<!DOCTYPE html>)去掉之后,选择器就能正常工作了,
虽然不知为啥,但是办法有了。
于是就有第二版
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import logging

# Import urljoin explicitly instead of relying on the star import above to
# provide it; the fallback keeps the script usable on Python 2.
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class Handler(BaseHandler):
    """Second attempt at crawling the CSDN knowledge-base listing.

    Same crawl as the first attempt, but listing pages are re-parsed with
    PyQuery after the DOCTYPE declaration has been mangled.
    """

    def on_start(self):
        """Seed the crawl with the knowledge-base listing page."""
        self.crawl('http://lib.csdn.net/bases', callback=self.index_page)

    def index_page(self, response):
        """Queue knowledge-base links and pagination links found on a listing page."""
        # Workaround: the surrounding write-up found that the selectors only
        # matched once the DOCTYPE line was out of the way; the cause was not
        # determined. Renaming the keyword neutralises the declaration.
        patched_html = response.text.replace("DOCTYPE", "delDOCTYPE")
        doc = pq(patched_html)

        for each in doc('a[href^="http://lib.csdn.net/base/"]').items():
            logging.debug(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)

        for each in doc('a[href^="http://lib.csdn.net/bases/fd/all/ast/all/"]').items():
            # Joining with '?=' swaps any existing query string for a literal
            # "?=", which is then removed: the net effect is the link without
            # its query string.
            page_url = urljoin(each.attr.href, '?=').replace('?=', '')
            self.crawl(page_url, callback=self.index_page)

    def detail_page(self, response):
        """Return the URL and <title> text of a knowledge-base page."""
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
欧了!