# -*- coding: utf-8 -*-
"""
Created on 2018-07-25 11:49:08
---------
@summary: Request structure
---------
@author: Boris
@email: [email protected]
"""
import requests
from requests.adapters import HTTPAdapter
from requests.cookies import RequestsCookieJar
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import feapder.setting as setting
import feapder.utils.tools as tools
from feapder.db.redisdb import RedisDB
from feapder.network import user_agent
from feapder.network.proxy_pool import ProxyPool
from feapder.network.response import Response
from feapder.utils.log import log
from feapder.utils.webdriver import WebDriverPool
# Suppress urllib3's InsecureRequestWarning messages
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Request(object):
    """
    A single crawl request.

    Holds the framework-level scheduling options (priority, callback, retry
    count, ...) together with the keyword arguments that are forwarded to
    ``requests`` (see ``__REQUEST_ATTRS__``). Any other keyword passed to the
    constructor is stored as a plain instance attribute.
    """

    # Resources shared by all requests; created lazily by the matching
    # ``_session`` / ``_webdriver_pool`` / ``_proxies_pool`` / ``_cache_db``
    # properties and stored on the class.
    session = None
    webdriver_pool: WebDriverPool = None
    user_agent_pool = user_agent
    proxies_pool: ProxyPool = None

    cache_db = None  # redis / pika
    # Namespace used for cached responses:
    # response_cached:<cached_redis_key>:<fingerprint md5>
    cached_redis_key = None
    cached_expire_time = 1200  # cache expiry, passed as ``ex`` to ``strset``

    local_filepath = None
    oss_handler = None

    # Keyword arguments that are forwarded verbatim to ``requests``.
    __REQUEST_ATTRS__ = {
        # 'method' and 'url' must be passed explicitly; they are not
        # forwarded through **kwargs
        "params",
        "data",
        "headers",
        "cookies",
        "files",
        "auth",
        "timeout",
        "allow_redirects",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "json",
    }

    # Default values of the framework parameters; ``to_dict`` omits any
    # attribute that still equals its default.
    DEFAULT_KEY_VALUE = dict(
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
    )

    def __init__(
        self,
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
        **kwargs,
    ):
        """
        @summary: Request parameters
        ---------
        Framework parameters
        @param url: URL to crawl
        @param retry_times: current retry count
        @param priority: priority; the smaller the value the higher the priority. Default 300
        @param parser_name: name of the class that owns the callback. Defaults to the current class
        @param callback: callback, either a function or a function name (for a cross-class
            callback, parser_name names the class and callback names the method to call)
        @param filter_repeat: whether to de-duplicate (True/False). Takes effect when
            REQUEST_FILTER_ENABLE in setting is True. Default True
        @param auto_request: whether the page is downloaded automatically. Default True. When
            False the response handed back is empty and the page must be requested manually
        @param request_sync: whether to download synchronously. Default asynchronous. If the url
            expires quickly set this to True: the yielded request is then handled immediately
            instead of being queued
        @param use_session: whether to use a session; None falls back to setting.USE_SESSION
        @param random_user_agent: whether to use a random User-Agent (True/False). Takes effect
            when RANDOM_HEADERS in setting is True. Default True
        @param download_midware: download middleware. Defaults to the parser's download_midware
        @param is_abandoned: whether to give up retrying when an exception occurs. Default False
        @param render: whether to render the page with a browser
        @param render_time: render duration, i.e. how long to wait after opening the page before
            reading its source; a falsy value falls back to setting.WEBDRIVER["render_time"]
        --
        The following parameters are used the same way as the requests parameters
        @param method: request method such as POST or GET; by default decided by whether
            data/json is present
        @param params: query parameters
        @param data: request body
        @param json: JSON body, same as json.dumps(data)
        @param headers:
        @param cookies: dict or CookieJar object
        @param files:
        @param auth:
        @param timeout: (float or tuple) how long to wait for the server; a float or a
            (connect timeout, read timeout) tuple
        @param allow_redirects: Boolean. True allows following POST/PUT/DELETE redirects
        @param proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
        @param verify: when True the SSL certificate is verified
        @param stream: when False the response body is downloaded immediately
        @param cert:
        --
        @param **kwargs: other values, e.g. Request(item=item) makes item available as
            request.item
        ---------
        @result:
        """
        self.url = url
        self.retry_times = retry_times
        self.priority = priority
        self.parser_name = parser_name
        self.callback = callback
        self.filter_repeat = filter_repeat
        self.auto_request = auto_request
        self.request_sync = request_sync
        self.use_session = use_session
        self.random_user_agent = random_user_agent
        self.download_midware = download_midware
        self.is_abandoned = is_abandoned
        self.render = render
        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)

        self.requests_kwargs = {}
        for key, value in kwargs.items():
            if key in self.__class__.__REQUEST_ATTRS__:  # collect requests parameters
                self.requests_kwargs[key] = value
            # Every extra keyword is also exposed as an instance attribute.
            # NOTE(review): a keyword named like a method (e.g. ``proxies``)
            # shadows that method on this instance.
            self.__dict__[key] = value

    def __repr__(self):
        try:
            return "<Request {}>".format(self.url)
        except Exception:
            # ``url`` may be missing on a partially built instance.
            return "<Request {}>".format(str(self.to_dict)[:40])

    def __setattr__(self, key, value):
        """
        Handle ``request.xxx = xxx`` so that the kwargs forwarded to requests
        stay in sync with the attribute.
        @param key: attribute name
        @param value: attribute value
        @return:
        """
        self.__dict__[key] = value
        if key in self.__class__.__REQUEST_ATTRS__:
            self.requests_kwargs[key] = value

    def __lt__(self, other):
        """Order requests by ``priority`` (smaller sorts first)."""
        return self.priority < other.priority

    def _use_session_enabled(self):
        """Return the effective session switch; ``self.use_session`` wins over the setting."""
        return setting.USE_SESSION if self.use_session is None else self.use_session

    @staticmethod
    def _strip_scheme(address):
        """Return ``address`` without a leading ``scheme://`` and without trailing slashes."""
        # ``str.strip("http://")`` removes a *set of characters* from both
        # ends, which mangles hosts such as "proxy.example.com"; split on the
        # scheme separator instead.
        return (address or "").split("://", 1)[-1].rstrip("/")

    @classmethod
    def _proxy_address(cls, proxies):
        """Return ``ip:port`` of the http proxy, falling back to the https proxy."""
        return cls._strip_scheme(proxies.get("http", "")) or cls._strip_scheme(
            proxies.get("https", "")
        )

    @property
    def _session(self):
        """
        Return the class-wide ``requests.Session``, creating it on first use
        when sessions are enabled; returns the current class attribute
        (possibly None) otherwise.
        """
        cls = self.__class__
        if self._use_session_enabled() and not cls.session:
            session = requests.Session()
            # pool_connections - number of urllib3 connection pools to cache
            # pool_maxsize - maximum number of connections kept in a pool
            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
            # An adapter is used for every request of the session whose URL
            # starts with the mounted prefix. requests picks the longest
            # matching prefix, so the adapter has to replace the default
            # "http://" and "https://" adapters; a bare "http" prefix would
            # always lose to them.
            session.mount("http://", http_adapter)
            session.mount("https://", http_adapter)
            cls.session = session
        return cls.session

    @property
    def _webdriver_pool(self):
        """Return the class-wide WebDriverPool, built from ``setting.WEBDRIVER`` on first use."""
        cls = self.__class__
        if not cls.webdriver_pool:
            cls.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
        return cls.webdriver_pool

    @property
    def _proxies_pool(self):
        """Return the class-wide ProxyPool, created on first use."""
        cls = self.__class__
        if not cls.proxies_pool:
            cls.proxies_pool = ProxyPool()
        return cls.proxies_pool

    @property
    def to_dict(self):
        """
        Serialize the request into a dict.

        Attributes that still hold their default value and ``requests_kwargs``
        are omitted. Values that are not of a plain type are converted with
        ``tools.dumps_obj``.

        NOTE: as a side effect ``self.callback`` and ``self.download_midware``
        are replaced by their ``__name__`` when they are callables.
        """
        self.callback = self.callback_name
        self.download_midware = (
            getattr(self.download_midware, "__name__")
            if callable(self.download_midware)
            else self.download_midware
        )

        defaults = self.__class__.DEFAULT_KEY_VALUE
        request_attrs = self.__class__.__REQUEST_ATTRS__
        plain_types = (bytes, bool, float, int, str)
        # requests parameters may additionally stay as containers
        request_types = plain_types + (tuple, list, dict)

        request_dict = {}
        for key, value in self.__dict__.items():
            if key == "requests_kwargs":
                continue
            if key in defaults and defaults.get(key) == value:
                continue

            allowed_types = request_types if key in request_attrs else plain_types
            if not isinstance(value, allowed_types):
                value = tools.dumps_obj(value)
            request_dict[key] = value

        return request_dict

    @property
    def callback_name(self):
        """Return the callback's ``__name__`` if it is callable, else the callback itself."""
        if callable(self.callback):
            return getattr(self.callback, "__name__")
        return self.callback

    def get_response(self, save_cached=False):
        """
        Download the page and return a Response with selector support.
        @param save_cached: save the response to the cache, which is handy when
            debugging because the page need not be downloaded every time
        @return: Response
        """
        kwargs = self.requests_kwargs

        # default timeout
        kwargs.setdefault("timeout", setting.REQUEST_TIMEOUT)

        # stream
        # By default the response body is downloaded immediately. With
        # stream=True the download of the body is deferred until
        # Response.content is accessed; only the headers are fetched up front.
        # Drawback: Requests cannot release the connection back to the pool
        # until all data has been consumed or Response.close is called, which
        # can make connection usage inefficient.
        kwargs.setdefault("stream", True)

        # disable certificate verification
        kwargs.setdefault("verify", False)

        # request method
        method = self.__dict__.get("method")
        if not method:
            method = "POST" if ("data" in kwargs or "json" in kwargs) else "GET"

        # random user-agent
        headers = kwargs.get("headers")
        if headers is None:  # missing or explicitly None
            headers = {}
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                ua = None
                if self.render:
                    # when rendering, prefer the ua configured in WEBDRIVER
                    ua = setting.WEBDRIVER.get("user_agent")
                if not ua:
                    ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
                headers.update({"User-Agent": ua})
                kwargs.update(headers=headers)
            else:
                kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
                )

        # proxies: only fetched from the pool when the caller supplied none
        proxies = kwargs.get("proxies")
        if (
            "proxies" not in kwargs
            and setting.PROXY_ENABLE
            and setting.PROXY_EXTRACT_API
        ):
            # NOTE(review): loops until the pool hands out a proxy; relies on
            # the pool's ``get`` to pace itself.
            while True:
                proxies = self._proxies_pool.get()
                if proxies:
                    kwargs.update(proxies=proxies)
                    break
                log.debug("暂无可用代理 ...")

        log.debug(
            """
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """
            % (
                ""
                if not self.parser_name
                else "%s.%s " % (self.parser_name, self.callback_name or "parse"),
                self.url,
                method,
                kwargs,
            )
        )

        if self.render:
            response = self._render_response(headers, proxies)
        elif self._use_session_enabled():
            response = Response(self._session.request(method, self.url, **kwargs))
        else:
            response = Response(requests.request(method, self.url, **kwargs))

        if save_cached:
            self.save_cached(response, expire_time=self.__class__.cached_expire_time)

        return response

    def _render_response(self, headers, proxies):
        """Load the url in a pooled browser and wrap the rendered page in a Response."""
        # reuse the request's user agent, cookies and proxy
        ua = headers.get("User-Agent") or headers.get("user-agent")

        cookies = self.requests_kwargs.get("cookies")
        if cookies and isinstance(cookies, RequestsCookieJar):
            cookies = cookies.get_dict()
        if not cookies:
            cookie_str = headers.get("Cookie") or headers.get("cookie")
            if cookie_str:
                cookies = tools.get_cookies_from_str(cookie_str)

        proxy = self._proxy_address(proxies) if proxies else None

        browser = self._webdriver_pool.get(user_agent=ua, proxy=proxy)
        try:
            browser.get(self.url)
            if cookies:
                browser.cookies = cookies
            if self.render_time:
                tools.delay_time(self.render_time)

            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,  # fixed value, not read from the browser
                    "elapsed": 666,  # placeholder, not a measured duration
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        ),
                        "Cookie": tools.cookies2str(browser.cookies),
                    },
                }
            )
            response.browser = browser
        except Exception:
            # a browser that failed is dropped from the pool
            self._webdriver_pool.remove(browser)
            raise
        return response

    def proxies(self):
        """
        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
        """
        return self.requests_kwargs.get("proxies")

    def proxy(self):
        """
        Returns: ip:port, or None when no proxies are set
        """
        # Read requests_kwargs directly: ``self.proxies`` is shadowed by the
        # instance attribute when ``proxies`` was passed to the constructor.
        proxies = self.requests_kwargs.get("proxies")
        if proxies:
            return self._proxy_address(proxies)

    def user_agent(self):
        """Return the User-Agent from the request headers, or None when there is none."""
        headers = self.requests_kwargs.get("headers")
        if headers:
            return (
                headers.get("user_agent")  # legacy key, kept for compatibility
                or headers.get("User-Agent")
                or headers.get("user-agent")
            )

    @property
    def fingerprint(self):
        """
        Unique identifier of the request: md5 over the canonicalized url and
        the non-empty params/data/files/auth/cert/json values.
        @return:
        """
        # canonicalize the url
        url = tools.canonicalize_url(self.__dict__.get("url", ""))
        args = [url]

        for arg in ("params", "data", "files", "auth", "cert", "json"):
            value = self.requests_kwargs.get(arg)
            if value:
                args.append(value)

        return tools.get_md5(*args)

    @property
    def _cache_db(self):
        """Return the class-wide cache database, created on first use."""
        cls = self.__class__
        if not cls.cache_db:
            cls.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
        return cls.cache_db

    @property
    def _cached_redis_key(self):
        """Return the cache key of this request; the namespace defaults to ``test``."""
        namespace = self.__class__.cached_redis_key or "test"
        return f"response_cached:{namespace}:{self.fingerprint}"

    def save_cached(self, response, expire_time=1200):
        """
        Save the response in redis, for debugging, so the page is not
        downloaded every time.
        @param response: the Response to cache
        @param expire_time: expiry time
        @return:
        """
        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)

    def get_response_from_cached(self, save_cached=True):
        """
        Get the response from the cache.
        Note:
            attributes that are empty:
                - raw: urllib3.response.HTTPResponse
                - connection: requests.adapters.HTTPAdapter
                - history

            attributes whose meaning changes:
                - request: changes from requests to Request
        @param save_cached: when there is no cached copy the page is downloaded;
            this decides whether the download is then saved to the cache
        @return: Response
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存  重新下载")
            return self.get_response(save_cached=save_cached)

        # SECURITY(review): eval executes whatever is stored under the cache
        # key; only safe while the cache is written exclusively by
        # ``save_cached`` on a trusted store.
        return Response.from_dict(eval(response_dict))

    def del_response_cached(self):
        """Remove this request's cached response."""
        self._cache_db.clear(self._cached_redis_key)

    @classmethod
    def from_dict(cls, request_dict):
        """
        Build a request from a dict such as the one produced by ``to_dict``.
        The passed dict is left untouched.
        """
        # bytes values are deserialized (e.g. an item)
        # NOTE(review): every bytes value is passed to tools.loads_obj, so
        # this assumes they were all produced by tools.dumps_obj - confirm for
        # raw bytes bodies and untrusted sources.
        kwargs = {
            key: tools.loads_obj(value) if isinstance(value, bytes) else value
            for key, value in request_dict.items()
        }
        return cls(**kwargs)

    def copy(self):
        """Return a new request rebuilt from ``to_dict`` (which also normalizes callbacks on self)."""
        return self.__class__.from_dict(self.to_dict)