css
在 scrapy shell 里调试 CSS 选择器比较方便,例如:
scrapy shell http://blog.jobbole.com/112127/
先把所有的代码贴出来
# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem,ArticleItemLoader
class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com list pages and scrape every article they link to.

    ``parse`` walks the paginated list pages and schedules one request per
    article; ``parse_detail`` turns each article page into a
    ``JobBoleArticleItem`` through ``ArticleItemLoader``.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/category/php-programmer/']

    # Date layouts tried, in order, by _parse_create_time.
    # NOTE(review): "%Y/%m/%d" is presumed to be the layout the site prints;
    # "%Y%m%d" is the layout the original code named. Confirm against a page.
    _DATE_FORMATS = ("%Y/%m/%d", "%Y%m%d")

    def parse(self, response):
        """Yield a request per article on a list page, then follow pagination.

        Args:
            response: the downloaded list page.

        Yields:
            ``Request`` objects: one per article (callback ``parse_detail``,
            carrying the thumbnail URL in ``meta["front_img_url"]``), and one
            for the next list page when a "next" link exists.
        """
        # Keep the nodes as selectors: once extract() has turned them into a
        # list of strings they cannot be queried a second time.
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            post_url = post_node.css("::attr(href)").extract_first("")
            if not post_url:
                # Without an href, urljoin() would just return the list page
                # URL again; there is no article to fetch.
                continue
            img_url = post_node.css("img::attr(src)").extract_first("")
            yield Request(
                url=parse.urljoin(response.url, post_url),
                meta={"front_img_url": img_url},
                callback=self.parse_detail,
            )

        next_url = response.css(".next.page-numbers::attr(href)").extract_first()
        if next_url:
            yield Request(
                url=parse.urljoin(response.url, next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Build one article item from an article page via the item loader.

        Args:
            response: the downloaded article page; ``response.meta`` may carry
                ``front_img_url`` set by ``parse``.

        Yields:
            The item returned by ``ArticleItemLoader.load_item()``.
        """
        front_img_url = response.meta.get("front_img_url", "")

        # The loader only records selectors/values here; cleaning is done by
        # the processors declared on the item fields and on the loader.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")

        yield item_loader.load_item()

    def _extract_with_selectors(self, response):
        """Build the article item by hand, one CSS selector at a time.

        Kept as a reference for the selector-by-selector approach. It is not
        called by ``parse_detail``, which previously computed this item and
        then discarded it in favour of the loader result.

        Args:
            response: the downloaded article page.

        Returns:
            A ``JobBoleArticleItem`` populated from the page.
        """
        article_item = JobBoleArticleItem()

        # XPath equivalent of the title selector:
        #   //div[@class='entry-header']/h1/text()
        # extract_first("") avoids the IndexError that extract()[0] raises
        # when the node is missing.
        article_item["title"] = response.css(".entry-header h1::text").extract_first("")
        article_item["url"] = response.url
        article_item["front_img_url"] = response.meta.get("front_img_url", "")
        article_item["praise_nums"] = self._first_int(
            response.css(".vote-post-up h10::text").extract_first("0")
        )
        article_item["fav_nums"] = self._first_int(
            response.css(".bookmark-btn::text").extract_first("0")
        )
        article_item["create_time"] = self._parse_create_time(
            response.css(".entry-meta-hide-on-mobile::text").extract_first("")
        )
        article_item["content"] = response.css("div.entry").extract_first("")
        return article_item

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in ``text`` as an int, or 0 if none."""
        # re.search finds the whole first number; the former ".*(\d+).*"
        # pattern was greedy and captured only the final digit.
        found = re.search(r"\d+", text)
        return int(found.group()) if found else 0

    @classmethod
    def _parse_create_time(cls, text):
        """Parse the publication date from ``text``; fall back to today's date."""
        cleaned = text.replace("·", "").strip()
        for fmt in cls._DATE_FORMATS:
            try:
                # strptime parses a string; the former strftime call always
                # raised on a string, so the fallback was always taken.
                return datetime.datetime.strptime(cleaned, fmt).date()
            except ValueError:
                continue
        return datetime.datetime.now().date()
item_loader(以下是 items.py 的完整代码)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import datetime
import re
from scrapy.loader.processors import MapCompose,TakeFirst
from scrapy.loader import ItemLoader
class ArticlespiderItem(scrapy.Item):
    """Placeholder item from the project template; it declares no fields.

    Fields would be declared here as ``name = scrapy.Field()``.
    """
def date_convert(value):
    """Convert scraped date text into a ``datetime.date``.

    Surrounding whitespace and the "·" separator are removed before parsing.
    The text is tried against "%Y/%m/%d" and then "%Y%m%d".

    Args:
        value: raw date text, e.g. ``" 2017/06/20 · "``.

    Returns:
        The parsed ``datetime.date``, or today's date when ``value`` is not
        text or matches neither layout.
    """
    try:
        cleaned = value.replace("·", "").strip()
    except (AttributeError, TypeError):
        # Not a str (e.g. None or bytes): keep the original best-effort
        # behaviour of falling back to the current date.
        return datetime.datetime.now().date()

    for fmt in ("%Y/%m/%d", "%Y%m%d"):
        try:
            # strptime parses text into a datetime. The previous strftime
            # call formats a datetime and always raised on a string, so the
            # fallback date was returned for every article.
            return datetime.datetime.strptime(cleaned, fmt).date()
        except ValueError:
            continue
    return datetime.datetime.now().date()
def get_num_value(value):
    """Return the first integer found in ``value``, or 0 when there is none.

    Args:
        value: text that may contain a number, e.g. ``" 12 收藏"``.

    Returns:
        The first run of digits as an ``int``; 0 if ``value`` has no digits.
    """
    # re.search locates the complete first number. The previous pattern
    # ".*(\d+).*" was greedy, so the leading ".*" swallowed all but the last
    # digit ("12" -> "2"); it also returned a str on match but an int 0
    # otherwise.
    match_nums = re.search(r"\d+", value)
    return int(match_nums.group()) if match_nums else 0
class ArticleItemLoader(ItemLoader):
    """Item loader whose default output processor is ``TakeFirst()``.

    Fields that declare no output processor of their own use this default.
    """

    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Fields scraped from one jobbole article page.

    ``praise_nums`` and ``fav_nums`` pass each input value through
    ``get_num_value``; ``create_time`` passes each input value through
    ``date_convert``. The remaining fields declare no processors.
    """

    # Fields stored without any input processing.
    title = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    front_img_url = scrapy.Field()

    # Counters: each input value is mapped through get_num_value.
    praise_nums = scrapy.Field(input_processor=MapCompose(get_num_value))
    fav_nums = scrapy.Field(input_processor=MapCompose(get_num_value))

    # Publication date: each input value is mapped through date_convert.
    create_time = scrapy.Field(input_processor=MapCompose(date_convert))
Item Loaders 提供了一种便捷的机制(mechanism)来填充抓取到的 Items。虽然 Items 可以通过其自身类似字典(dictionary-like)的 API 来填充,但 Item Loaders 提供了更方便的 API:它们把抓取过程中一些具有共通性的任务自动化,例如在赋值之前先对提取到的原始数据进行解析和处理。换句话说,Items 提供了盛装抓取数据的容器,而 Item Loaders 提供了填充(populating)该容器的机制。
item_loader的例子
# Load the item through the item loader.
# NOTE: this excerpt assumes `response` and `front_img_url` are already
# defined, as they are inside the spider's parse_detail method.
item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
# add_css registers a CSS selector for a field; add_value supplies a value
# directly.
item_loader.add_css("title", ".entry-header h1::text")
item_loader.add_value("url", response.url)
item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
item_loader.add_value("front_img_url", [front_img_url])
item_loader.add_css("praise_nums", ".vote-post-up h10::text")
item_loader.add_css("fav_nums", ".bookmark-btn::text")
item_loader.add_css("content", "div.entry")
# load_item() returns the populated item.
article_item = item_loader.load_item()