Getting Started with Scrapy (3) — XPath and CSS Selector Syntax

CSS syntax

Debugging selectors is easiest in the Scrapy shell:

scrapy shell http://blog.jobbole.com/112127/
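
Once the shell starts, selectors can be tried against the downloaded response interactively before they go into the spider. A quick sketch using the same selectors that appear in the code below:

>>> response.css(".entry-header h1::text").extract_first()                      # article title
>>> response.xpath("//div[@class='entry-header']/h1/text()").extract_first()    # same title via XPath
>>> response.css(".vote-post-up h10::text").extract_first("0")                  # praise count, "0" when missing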

First, the complete spider code:

# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem,ArticleItemLoader

class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/category/php-programmer/']

    def parse(self, response):
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        # After extract() the result becomes a list of strings, so it cannot be queried with selectors again
        # post_urls = response.css("#archive .floated-thumb .post-thumb").extract()
        for post_node in post_nodes:
            post_url = post_node.css("::attr(href)").extract_first("")
            img_url = post_node.css("img::attr(src)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={"front_img_url": img_url}, callback=self.parse_detail)
        next_url = response.css(".next.page-numbers::attr(href)").extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        article_item = JobBoleArticleItem()
        # title = response.xpath("//div[@class='entry-header']/h1/text()").extract()[0]
        # time = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","")
        # praise_nums = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract()[0]
        title = response.css(".entry-header h1::text").extract()[0]
        front_img_url = response.meta.get("front_img_url", "")
        url = response.url
        create_time = response.css(".entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "")
        praise_nums = response.css(".vote-post-up h10::text").extract_first("0")
        fav_nums = response.css(".bookmark-btn::text").extract_first("0")
        # Use a non-greedy .*? so the whole number is captured; a greedy .* would leave only the last digit
        match_nums = re.match(r".*?(\d+).*", fav_nums)
        if match_nums:
            fav_nums = match_nums.group(1)
        else:
            fav_nums = 0
        content = response.css("div.entry").extract()[0]
        # for i, p in enumerate(content):
        #     print(i, p)
        article_item["title"] = title
        article_item["front_img_url"] = front_img_url
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        try:
            # strptime (not strftime) parses a date string; adjust the format to match the date text on the page
            create_time = datetime.datetime.strptime(create_time, "%Y/%m/%d").date()
        except Exception as e:
            create_time = datetime.datetime.now().date()
        article_item["create_time"] = create_time
        article_item["url"] = url
        article_item["content"] = content

        # Load the fields via the Item Loader instead of filling the item by hand
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()

        yield article_item
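
With the spider in place, it can be run from the project root in the usual way; the spider name comes from the name attribute above, and the optional -o flag dumps the scraped items to a file:

scrapy crawl jobbole
scrapy crawl jobbole -o articles.json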

item_loader (items.py)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
import datetime
import re

from scrapy.loader.processors import MapCompose,TakeFirst
from scrapy.loader import ItemLoader
class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

def date_convert(value):
    try:
        # strptime (not strftime) parses a date string; adjust the format to match the date text on the page
        create_time = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_time = datetime.datetime.now().date()
    return create_time

def get_num_value(value):
    # Non-greedy .*? so the whole number is captured, not just its last digit
    match_nums = re.match(r".*?(\d+).*", value)
    if match_nums:
        nums = match_nums.group(1)
    else:
        nums = 0
    return nums

class ArticleItemLoader(ItemLoader):
    # Return a single value for every field instead of a list
    default_output_processor = TakeFirst()

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    # front_img_path = scrapy.Field()
    url = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_num_value),
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_num_value),
    )
    content = scrapy.Field()
    create_time = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    front_img_url = scrapy.Field()
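
A minimal sketch of how the two processors used above behave, run outside of any spider with made-up values:

from scrapy.loader.processors import MapCompose, TakeFirst

# MapCompose runs every extracted value through the given functions, in order
strip_and_upper = MapCompose(str.strip, str.upper)
print(strip_and_upper([" php ", " python "]))   # ['PHP', 'PYTHON']

# TakeFirst (used above as default_output_processor) keeps the first non-empty value
print(TakeFirst()(['PHP', 'PYTHON']))           # 'PHP'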

Item Loaders provide a convenient mechanism for populating scraped Items. Although Items can be filled directly through their own dictionary-like API, Item Loaders offer a more convenient API that automates common tasks in the scraping process, such as pre-processing the raw extracted data before it is assigned. In other words, Items are the containers that hold the scraped data, while Item Loaders are the mechanism for populating those containers.

An item_loader example

        # Load the fields via the Item Loader instead of filling the item by hand
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()
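
Because ArticleItemLoader sets default_output_processor = TakeFirst(), load_item() returns each field as a single value rather than a list, and the input processors (date_convert, get_num_value) have already been applied to create_time, praise_nums and fav_nums before the item is built.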
