scrapy教程初接触(3)—xpath语法和css语法

css
在shell底下调试比较方便

scrapy shell http://blog.jobbole.com/112127/

先把所有的代码贴出来

# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem,ArticleItemLoader

class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com listing pages and scrape each linked article.

    ``parse`` handles a listing page: it schedules one request per article
    thumbnail link and then follows the "next page" link.  ``parse_detail``
    handles an article page and yields a ``JobBoleArticleItem``.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/category/php-programmer/']

    def parse(self, response):
        """Schedule article requests for a listing page, then paginate.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``Request`` per article (handled by ``parse_detail``, with the
            thumbnail URL passed along in ``meta["front_img_url"]``) and,
            when a next-page link exists, a ``Request`` for that page
            handled by this method again.
        """
        # Keep the selector objects here: once extract() has turned them
        # into a list of strings they can no longer be queried again.
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        # post_urls = response.css("#archive .floated-thumb .post-thumb").extract()
        for post_node in post_nodes:
            post_url = post_node.css("::attr(href)").extract_first("")
            img_url = post_node.css("img::attr(src)").extract_first("")
            # urljoin resolves relative hrefs against the listing page URL.
            yield Request(
                url=parse.urljoin(response.url, post_url),
                meta={"front_img_url": img_url},
                callback=self.parse_detail,
            )

        next_url = response.css(".next.page-numbers::attr(href)").extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the fields of one article page.

        The fields are first extracted by hand with CSS selectors and then
        again through ``ArticleItemLoader``; only the loader's item is
        yielded (see the NOTE below).

        Args:
            response: The downloaded article page; ``response.meta`` may
                carry ``front_img_url`` set by ``parse``.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        article_item = JobBoleArticleItem()

        # XPath equivalents of the CSS selectors used below:
        # title = response.xpath("//div[@class='entry-header']/h1/text()").extract()[0]
        # time = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","")
        # praise_nums = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract()[0]

        # extract_first("") instead of extract()[0]: a missing node must not
        # raise IndexError and abort the callback before the item is yielded.
        title = response.css(".entry-header h1::text").extract_first("")
        front_img_url = response.meta.get("front_img_url", "")
        url = response.url

        # Trim whitespace and drop the middle dot (U+00B7) next to the date.
        date_text = response.css(".entry-meta-hide-on-mobile::text").extract_first("")
        date_text = date_text.strip().replace("\u00b7", "").strip()

        praise_nums = response.css(".vote-post-up h10::text").extract_first("0")

        # Take the first full run of digits.  The previous greedy pattern
        # ".*(\d+).*" captured only the last digit (e.g. "12" -> "2").
        fav_text = response.css(".bookmark-btn::text").extract_first("0")
        match_nums = re.search(r"\d+", fav_text)
        fav_nums = int(match_nums.group()) if match_nums else 0

        content = response.css("div.entry").extract_first("")

        # Parse with strptime (string -> datetime); strftime goes the other
        # way and always raised on a string, so every article used to get
        # today's date.  Fall back to today only when no format matches.
        for date_format in ("%Y%m%d", "%Y/%m/%d"):
            try:
                create_time = datetime.datetime.strptime(date_text, date_format).date()
                break
            except ValueError:
                continue
        else:
            create_time = datetime.datetime.now().date()

        article_item["title"] = title
        article_item["front_img_url"] = front_img_url
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["create_time"] = create_time
        article_item["url"] = url
        article_item["content"] = content

        # Load the same fields through the item loader.
        # NOTE: this rebinds article_item, so the manually built item above
        # is discarded and only the loader's result is yielded.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()

        yield article_item

item_loader

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
import datetime
import re

from scrapy.loader.processors import MapCompose,TakeFirst
from scrapy.loader import ItemLoader
class ArticlespiderItem(scrapy.Item):
    """Placeholder item that declares no fields.

    Fields would be declared as class attributes, e.g.
    ``name = scrapy.Field()``.
    """

def date_convert(value):
    """Parse a scraped publication date string into a ``datetime.date``.

    Surrounding whitespace and the middle-dot separator (U+00B7) are removed
    before parsing, the same clean-up the spider's manual extraction applies
    to this text.  Both ``YYYYMMDD`` and ``YYYY/MM/DD`` are accepted.

    Args:
        value: Raw text extracted for the ``create_time`` field.

    Returns:
        The parsed date, or today's date when ``value`` is not a string or
        matches neither format.
    """
    if isinstance(value, str):
        date_text = value.strip().replace("\u00b7", "").strip()
        for date_format in ("%Y%m%d", "%Y/%m/%d"):
            try:
                # strptime parses text into a datetime.  The previous
                # strftime call formats a datetime instead, so it raised on
                # every string and the fallback below was always taken.
                return datetime.datetime.strptime(date_text, date_format).date()
            except ValueError:
                continue
    # Best-effort fallback kept from the original behaviour.
    return datetime.datetime.now().date()
def get_num_value(value):
    """Return the first integer found in ``value``, or 0 when there is none.

    Args:
        value: Text that may contain a count among other characters.

    Returns:
        The first run of digits in ``value`` as an ``int``; ``0`` when
        ``value`` contains no digit.
    """
    # Search for the first full run of digits.  The previous pattern
    # ".*(\d+).*" was greedy, so the group held only the last digit
    # ("12" -> "2"), and the result was a str on match but an int otherwise.
    match_nums = re.search(r"\d+", value)
    if match_nums is None:
        return 0
    return int(match_nums.group())
class ArticleItemLoader(ItemLoader):
    """Item loader whose default output processor is ``TakeFirst()``."""

    # Applies to every field that does not declare an output processor of
    # its own, so loaded fields come out as single values rather than lists.
    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Fields collected from a single article page.

    ``praise_nums`` and ``fav_nums`` pass their input through
    ``get_num_value``; ``create_time`` passes its input through
    ``date_convert``.  The remaining fields have no processors attached.
    """

    title = scrapy.Field()
    # front_img_path = scrapy.Field()
    url = scrapy.Field()
    praise_nums = scrapy.Field(input_processor=MapCompose(get_num_value))
    fav_nums = scrapy.Field(input_processor=MapCompose(get_num_value))
    content = scrapy.Field()
    create_time = scrapy.Field(input_processor=MapCompose(date_convert))
    front_img_url = scrapy.Field()

Item Loaders 提供了一种简便的构件(mechanism)来抓取 Items。虽然 Items 可以从它自己的类似字典(dictionary-like)的 API 得到所需信息,不过 Item Loaders 提供了许多更加方便的 API,这些 API 通过自动完成那些具有共通性的任务,可从抓取进程中得到这些信息,比如预先解析提取到的原生数据。换句话来解释,Items 提供了盛装抓取到的数据的容器,而 Item Loaders 提供了构件来装载(populating)该容器。

item_loader的例子

        # Load the item through the item loader.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        # add_css() registers a CSS selector for a field; add_value() supplies
        # a value directly.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("create_time", ".entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_img_url", [front_img_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("content", "div.entry")
        # load_item() returns the populated item.
        article_item = item_loader.load_item()

All posts

Other pages

发表评论

电子邮件地址不会被公开。 必填项已用*标注