scrapy-redis 实现分布式爬虫
scrapy-redis 架构
1，安装scrapy-redis
pip install scrapy-redis
2，启用settings.py里面的组件
注意settings里面的中文注释会报错，换成英文
# Use scrapy-redis's scheduler so pending requests are stored/scheduled in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Do not clear the redis queues on close: allows pausing/resuming crawls.
SCHEDULER_PERSIST = True
# Queue class used to order crawl URLs; default is priority-based.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Optional FIFO ordering:
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional LIFO ordering:
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
# Only effective with SpiderQueue or SpiderStack: max idle time (seconds)
# before the spider is closed.
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Use RedisPipeline to store scraped items in redis.
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis connection parameters.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Custom redis client parameters (i.e.: socket timeout, etc.)
# For a password-protected redis, set e.g. REDIS_PARAMS['password'] = '...'
REDIS_PARAMS = {}
# REDIS_URL = 'redis://user:pass@hostname:9001'
# REDIS_PARAMS['password'] = 'itcast.cn'
LOG_LEVEL = 'DEBUG'
# FIX: use scrapy-redis's redis-backed dupefilter. Scrapy's default
# RFPDupeFilter keeps request fingerprints in local memory, so duplicate
# requests would NOT be shared across distributed crawler nodes.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# By default only the first duplicate request is logged; DUPEFILTER_DEBUG
# makes it log all duplicate requests.
DUPEFILTER_DEBUG = True
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, sdch',
}
代理的中间件
class ProxyMiddleware(object):
    """Downloader middleware that assigns each request a proxy taken from a
    redis-backed proxy pool (a redis list at key ``Proxy:queue``).

    Proxies that produce a 200 response are pushed back onto the pool in
    ``process_response``; proxies that cause a connection exception are
    dropped and the request is retried with a fresh proxy.
    """

    def __init__(self, settings):
        # Redis list holding the available "ip:port" proxy entries.
        self.queue = 'Proxy:queue'
        # db=1 keeps the proxy pool separate from scrapy-redis scheduler
        # data. Pass password=... here if the redis server requires auth.
        self.r = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=1)

    @classmethod
    def from_crawler(cls, crawler):
        """Standard scrapy factory: build the middleware from crawler settings."""
        return cls(crawler.settings)

    @staticmethod
    def _decode(value):
        # FIX: redis-py returns bytes by default in Python 3; the original
        # code formatted the raw value, yielding "http://b'1.2.3.4:80'".
        return value.decode('utf-8') if isinstance(value, bytes) else value

    def _pop_proxy(self):
        """Block until a proxy is available and return it as an 'ip:port' str."""
        _key, data = self.r.blpop(self.queue)
        return self._decode(data)

    def process_request(self, request, spider):
        ip_port = self._pop_proxy()
        # Credentials are not stored in the pool; set user_pass to
        # "USERNAME:PASSWORD" here if your proxies require authentication.
        user_pass = None
        request.meta['proxy'] = "http://%s" % ip_port
        if user_pass is not None:
            # FIX: base64.encodestring() was removed in Python 3.9 and
            # returned bytes anyway, so 'Basic ' + bytes raised TypeError.
            # b64encode + decode yields the str the header needs.
            encoded_user_pass = base64.b64encode(
                user_pass.encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print("********ProxyMiddleware have pass*****" + ip_port)
        else:
            # ProxyMiddleware no pass (anonymous proxy)
            print(request.url, ip_port)

    def process_response(self, request, response, spider):
        """Return the proxy to the pool when the response looks healthy.

        A non-200 status is treated as a dead/blocked proxy: it is not
        pushed back, so it drops out of rotation.
        """
        print("-------%s %s %s------" % (request.meta["proxy"], response.status, request.url))
        if response.status == 200:
            print('rpush', request.meta["proxy"])
            self.r.rpush(self.queue, request.meta["proxy"].replace('http://', ''))
        return response

    def process_exception(self, request, exception, spider):
        """Handle connection exceptions caused by a bad proxy.

        The failing proxy is abandoned; the request is retried (unfiltered)
        with a fresh proxy from the pool.
        """
        request.meta['proxy'] = "http://%s" % self._pop_proxy()
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
All posts
Other pages
Deprecated: 文件 没有 comments.php 的主题 自版本 3.0.0 起已弃用，且没有可用的替代。 请在您的主题中包含一个 comments.php 模板。 in /www/wwwroot/liguoqi.site/wp-includes/functions.php on line 6078
发表回复