第一次尝试就出了问题,查不出任何数据:
本想查一下 CSDN 知识库有多少条目,结果一条也查不出来。
from pyspider.libs.base_handler import *
import logging
class Handler(BaseHandler):
    """First-attempt pyspider handler for the CSDN knowledge-base listing.

    Starts from http://lib.csdn.net/bases, follows links whose href begins
    with http://lib.csdn.net/base/ to ``detail_page`` and links whose href
    begins with http://lib.csdn.net/bases/fd/all/ast/all/ back to
    ``index_page``.
    """

    def on_start(self):
        """Schedule the listing page as the crawl entry point."""
        self.crawl('http://lib.csdn.net/bases', callback=self.index_page)

    def index_page(self, response):
        """Queue the detail links and the further listing links on a page.

        Args:
            response: the fetched page; its ``doc`` attribute is queried
                with CSS attribute-prefix selectors.
        """
        detail_selector = 'a[href^="http://lib.csdn.net/base/"]'
        listing_selector = 'a[href^="http://lib.csdn.net/bases/fd/all/ast/all/"]'

        for each in response.doc(detail_selector).items():
            logging.debug(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)

        for each in response.doc(listing_selector).items():
            # NOTE(review): urljoin is not imported by name in this script;
            # presumably it is expected to arrive via the wildcard import --
            # confirm. Assuming the standard-library urljoin, joining with
            # '?=' and then deleting '?=' drops the query string from href.
            listing_url = urljoin(each.attr.href, '?=').replace('?=', '')
            # Listing pages are parsed by index_page; the callback must be a
            # method this class actually defines (there is no in_page).
            self.crawl(listing_url, callback=self.index_page)

    def detail_page(self, response):
        """Return the page URL and the text of its <title> element.

        Args:
            response: the fetched detail page.

        Returns:
            dict with the keys "url" and "title".
        """
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
# Author's note: frustrating -- tried every kind of selector and none of them worked,
于是在本地建了两个文件(一个 Python 文件,一个 txt 文件)试了一下,
咦?把文件的首行(<!DOCTYPE html>)去掉之后就可以了,
虽然不知为啥,但是办法有了。
于是就有了第二版:
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import logging
class Handler(BaseHandler):
    """Second-attempt pyspider handler for the CSDN knowledge-base listing.

    ``index_page`` does not use ``response.doc``; it builds its own PyQuery
    document from the response text after rewriting every "DOCTYPE" token.
    """

    def on_start(self):
        """Schedule the listing page as the crawl entry point."""
        start_url = 'http://lib.csdn.net/bases'
        self.crawl(start_url, callback=self.index_page)

    def index_page(self, response):
        """Queue the detail links and the further listing links on a page.

        Args:
            response: the fetched page; only its ``text`` attribute is read.
        """
        # Workaround from the surrounding write-up: the selectors matched
        # nothing until the <!DOCTYPE html> line was out of the way, so the
        # token is rewritten before parsing. The root cause is not established.
        patched_html = response.text.replace("DOCTYPE", "delDOCTYPE")
        document = pq(patched_html)

        detail_links = document('a[href^="http://lib.csdn.net/base/"]')
        for anchor in detail_links.items():
            logging.debug(anchor.attr.href)
            self.crawl(anchor.attr.href, callback=self.detail_page)

        listing_links = document(
            'a[href^="http://lib.csdn.net/bases/fd/all/ast/all/"]'
        )
        for anchor in listing_links.items():
            # NOTE(review): urljoin is not imported by name in this script;
            # presumably it is expected to arrive via the wildcard import --
            # confirm. Assuming the standard-library urljoin, joining with
            # '?=' and then deleting '?=' drops the query string from href.
            joined = urljoin(anchor.attr.href, '?=')
            listing_url = joined.replace('?=', '')
            self.crawl(listing_url, callback=self.index_page)

    def detail_page(self, response):
        """Return the page URL and the text of its <title> element.

        Args:
            response: the fetched detail page.

        Returns:
            dict with the keys "url" and "title".
        """
        record = {"url": response.url}
        record["title"] = response.doc('title').text()
        return record
# Author's note: done -- it works!