Not yet polished; there is still room for improvement.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'jiangwenwen'

import time

import pdfkit
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Request headers with a randomized User-Agent
ua = UserAgent()
headers = {
    "cache-control": "no-cache",
    "Host": "www.yinwang.org",
    "User-Agent": ua.random,
    "Referer": "http://www.yinwang.org/",
}

# IP proxy pool
ip_pool = ['123.55.114.217:9999',
           '110.52.235.91:9999',
           '183.163.43.61:9999',
           '119.101.126.52:9999',
           '119.101.124.165:9999',
           '119.101.125.38:9999',
           '119.101.125.84:9999',
           '110.52.235.80:9999',
           '119.101.125.49:9999',
           '110.52.235.162:9999',
           '119.101.124.23:9999']


# Fetch a page and print it to PDF
def print_pdf(url, file_name):
    start = time.time()
    print("Printing...")
    # Rotate the User-Agent on every request
    headers["User-Agent"] = ua.random
    print("User-Agent: {0}".format(headers["User-Agent"]))
    content = requests.get(url, headers=headers, timeout=3, proxies=get_proxy(ip_pool)).text
    pdfkit.from_string(content, file_name)
    end = time.time()
    print("Printed successfully in %0.2f seconds" % (end - start))


# Return the first working proxy from the pool, or None if none responds
# (requests treats proxies=None as a plain direct connection)
def get_proxy(ip_pool):
    url = "http://www.yinwang.org/"
    for ip in ip_pool:
        # Verify the proxy by making a real request through it
        try:
            requests.get(url, proxies={"http": "http://{}".format(ip)}, timeout=3)
        except requests.RequestException:
            continue
        else:
            return {
                "http": "http://{}".format(ip),
                "https": "http://{}".format(ip),
            }
    return None


response = requests.get("http://www.yinwang.org/", headers=headers, proxies=get_proxy(ip_pool))
soup = BeautifulSoup(response.content, 'html.parser')
tags = soup.find_all("li", class_="list-group-item title")
for child in tags:
    article_url = "http://www.yinwang.org" + child.a.get('href')
    article_file_name = "桌面\\" + child.a.string + ".pdf"  # 桌面 = Desktop folder
    print_pdf(article_url, article_file_name)
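One practical note on pdfkit: it is only a wrapper that shells out to the wkhtmltopdf binary, so pdfkit.from_string raises an OSError if wkhtmltopdf is not installed and on PATH. In that case you can point pdfkit at the binary explicitly. A minimal sketch; the install path below is an assumption, substitute your own:

import pdfkit

# pdfkit drives the external wkhtmltopdf binary; pass its location
# explicitly when it is not on PATH (assumed Windows install path here).
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_string("<h1>test</h1>", "test.pdf", configuration=config)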
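Another fragile spot is the output filename: the script uses child.a.string verbatim, and article titles can contain characters that Windows forbids in filenames (such as ? : " |). A small self-contained sketch of a guard; safe_filename is a hypothetical helper, not part of the original script:

import re

# Replace characters that are illegal in Windows filenames: \ / : * ? " < > |
def safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]+', "_", title).strip()

print(safe_filename('What is "FP"?'))  # -> What is _FP_

With this helper, the loop above would build the path as "桌面\\" + safe_filename(child.a.string) + ".pdf" instead of using the raw title.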