# _*_ coding: utf-8 _*_

"""
python_spider.py by xianhu

A walk-through of common web-fetching techniques using only the standard
library (urllib and http.cookiejar): plain requests, POST/GET data, custom
headers, error handling, proxies, cookies, file download and HTTP basic auth.

This is a flat demo script: every section performs real network requests
when the file is executed.
"""

import urllib.error
import urllib.parse
import urllib.request
import http.cookiejar


# First, define the variables the sections below may need.
url = "https://www.baidu.com"
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}


# The simplest way to fetch a web page.
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(url, timeout=10) as response:
    html = response.read().decode("utf-8")


# Use a Request instance instead of a bare url.
request = urllib.request.Request(url, data=None, headers={})
response = urllib.request.urlopen(request, timeout=10)
response.close()


# Send data, i.e. pass the data argument to Request().
# urlencode() returns a str: it can be appended to a URL as-is for GET, but
# the body of a POST request must be bytes, so encode it before use.
data = urllib.parse.urlencode({"act": "login", "email": "[email protected]", "password": "123456"})
post_data = data.encode("utf-8")
request1 = urllib.request.Request(url, data=post_data)      # POST method
request2 = urllib.request.Request(url + "?%s" % data)       # GET method
response = urllib.request.urlopen(request1, timeout=10)     # open the POST request built above
response.close()


# Send headers, i.e. pass the headers argument to Request().
request = urllib.request.Request(url, data=post_data, headers=headers)  # headers passed as an argument
# Another way to add a header; the Referer is added to get past
# anti-hotlinking checks.
request.add_header("Referer", "http://www.baidu.com")
response = urllib.request.urlopen(request, timeout=10)
response.close()


# Exceptions raised while fetching: urllib.error.HTTPError and
# urllib.error.URLError. HTTPError is a subclass of URLError, so it has to be
# caught first.
try:
    urllib.request.urlopen(request, timeout=10).close()
except urllib.error.HTTPError as e:
    print(e.code, e.reason)
except urllib.error.URLError as e:
    print(e.errno, e.reason)


# Use a proxy, to avoid the IP being banned or rate-limited.
proxy_handler = urllib.request.ProxyHandler(proxies={"http": "111.123.76.12:8080"})

opener = urllib.request.build_opener(proxy_handler)     # build an opener instance from the proxy handler
response = opener.open(url)                             # open the url directly with the opener instance
response.close()

urllib.request.install_opener(opener)                   # install the opener globally, then use urlopen as usual
response = urllib.request.urlopen(url)
response.close()


# Use cookies and a cookiejar, to cope with server-side checks.
cookie_jar = http.cookiejar.CookieJar()
cookie_jar_handler = urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar)
opener = urllib.request.build_opener(cookie_jar_handler)
response = opener.open(url)
response.close()


# Send cookies obtained from a browser; there are two ways:
# (1) put them directly into the headers
headers = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Cookie": "PHPSESSID=btqkg9amjrtoeev8coq0m78396; USERINFO=n6nxTHTY%2BJA39z6CpNB4eKN8f0KsYLjAQTwPe%2BhLHLruEbjaeh4ulhWAS5RysUM%2B; "
}
request = urllib.request.Request(url, headers=headers)

# (2) build a Cookie object and add it to the cookiejar.
# Cookie() has no defaults for these fields, so all of them must be given;
# the "xx" values are placeholders to be replaced with real cookie data.
cookie = http.cookiejar.Cookie(
    version=0,
    name="xx",
    value="xx",
    port=None,
    port_specified=False,
    domain="xx",
    domain_specified=True,
    domain_initial_dot=False,
    path="/",
    path_specified=True,
    secure=False,
    expires=None,
    discard=True,
    comment=None,
    comment_url=None,
    rest={},
)
cookie_jar.set_cookie(cookie)
response = opener.open(url)
response.close()


# Use a proxy and a cookiejar at the same time.
opener = urllib.request.build_opener(cookie_jar_handler)
opener.add_handler(proxy_handler)
response = opener.open("https://www.baidu.com/")
response.close()


# Fetch an image from a web page; the same approach works for any file on the
# network. Right-click the image, find its address in the image properties,
# then save it.
with urllib.request.urlopen("http://ww3.sinaimg.cn/large/7d742c99tw1ee7dac2766j204q04qmxq.jpg", timeout=120) as response:
    with open("test.jpg", "wb") as file_img:
        file_img.write(response.read())


# HTTP authentication, i.e. HTTP identity verification.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()     # create a password manager
password_mgr.add_password(realm=None, uri=url, user='username', passwd='password')  # add the user name and password
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)         # create an HTTPBasicAuthHandler
opener = urllib.request.build_opener(handler)                       # create the opener
response = opener.open(url, timeout=10)                             # fetch the data
response.close()