From dce0a44548837fa45cae4e55bb9c431ff1c223c0 Mon Sep 17 00:00:00 2001
From: zwingser <126068653+zwingser@users.noreply.github.com>
Date: Sat, 7 Oct 2023 16:19:34 +0800
Subject: [PATCH 01/13] Update proxyApi.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a plain-text (txt) response format.
---
 api/proxyApi.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/api/proxyApi.py b/api/proxyApi.py
index bd2de57e2..0d0cb9c12 100644
--- a/api/proxyApi.py
+++ b/api/proxyApi.py
@@ -43,7 +43,9 @@ def force_type(cls, response, environ=None):
 
 api_list = [
     {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"},
+    {"url": "/gettxt", "params": "type: ''https'|''", "desc": "get a proxy"},
     {"url": "/pop", "params": "", "desc": "get and delete a proxy"},
+    {"url": "/poptxt", "params": "", "desc": "get and delete a proxy"},
     {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"},
     {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"},
     {"url": "/count", "params": "", "desc": "return proxy count"}
@@ -62,6 +64,11 @@ def get():
     proxy = proxy_handler.get(https)
     return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
 
+@app.route('/gettxt/')
+def gettxt():
+    https = request.args.get("type", "").lower() == 'https'
+    proxy = proxy_handler.get(https)
+    return proxy._proxy if proxy else {"code": 0, "src": "no proxy"}
 
 @app.route('/pop/')
 def pop():
@@ -69,6 +76,11 @@ def pop():
     proxy = proxy_handler.pop(https)
     return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
 
+@app.route('/poptxt/')
+def poptxt():
+    https = request.args.get("type", "").lower() == 'https'
+    proxy = proxy_handler.pop(https)
+    return proxy._proxy if proxy else {"code": 0, "src": "no proxy"}
 
 @app.route('/refresh/')
 def refresh():

From 2ffb802ac66fa73b650c3c96b53179bce295c0c6 Mon Sep 17 00:00:00 2001
From: zwingser <126068653+zwingser@users.noreply.github.com>
Date: Sat, 7 Oct 2023 16:21:51 +0800
Subject: [PATCH 02/13] Update README.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document the new txt endpoints.
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index f48bad803..5c1464f5d 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,9 @@ docker-compose up -d
 | ----| ---- | ---- | ----|
 | / | GET | api介绍 | None |
 | /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理|
+| /gettxt | GET | 随机获取一个代理,非json,ip:port格式| 可选参数: `?type=https` 过滤支持https的代理|
 | /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理|
+| /poptxt | GET | 获取并删除一个代理,非json,ip:port格式| 可选参数: `?type=https` 过滤支持https的代理|
 | /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理|
 | /count | GET | 查看代理数量 |None|
 | /delete | GET | 删除代理 |`?proxy=host:ip`|
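For reference, the new plain-text endpoints can be exercised like this (a minimal client-side sketch, not part of the patches; it assumes a pool listening on 127.0.0.1:5010, the port used in the README's docker run example):

```python
import requests

BASE = "http://127.0.0.1:5010"

# /get returns JSON such as {"proxy": "ip:port", "https": false, ...}
print(requests.get(BASE + "/get/", params={"type": "https"}).json())

# /gettxt returns the bare "ip:port" string, convenient for shell pipelines
print(requests.get(BASE + "/gettxt/").text)

# /poptxt does the same but also removes the proxy from the pool
print(requests.get(BASE + "/poptxt/").text)
```

Note that the txt routes return the proxy's private `_proxy` attribute directly, so the response body is exactly the `ip:port` string with no JSON wrapping.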
From 8cd712d4547e5b2767a68b879f79247695795dfe Mon Sep 17 00:00:00 2001
From: zwingser <126068653+zwingser@users.noreply.github.com>
Date: Sat, 7 Oct 2023 16:41:02 +0800
Subject: [PATCH 03/13] Update Dockerfile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Install bash in the image to make it easier to shell in for maintenance.
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 89019cd7f..cd555003e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositorie
 RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata
 
 # runtime environment
-RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \
+RUN apk add bash musl-dev gcc libxml2-dev libxslt-dev && \
     pip install --no-cache-dir -r requirements.txt && \
     apk del gcc musl-dev

From 748baa90a6c1093892bcfdb21779ba3cd6aaac41 Mon Sep 17 00:00:00 2001
From: wingser
Date: Sat, 7 Oct 2023 18:49:56 +0800
Subject: [PATCH 04/13] [update] 1. Update the pagination parsing of
 'freeProxy01', 'freeProxy03', 'freeProxy08' and 'freeProxy10' so they can
 fetch IPs from more pages. 2. Use a uniform 10-second delay between loop
 iterations to avoid triggering the sites' request-rate protection.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fetcher/proxyFetcher.py | 62 ++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 1ed43cbad..1bb9ce1c6 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -12,8 +12,8 @@
 """
 __author__ = 'JHao'
 
-import re
 import json
+import re
 from time import sleep
 
 from util.webRequest import WebRequest
@@ -29,12 +29,12 @@ def freeProxy01():
         """
         站大爷 https://www.zdaye.com/dayProxy.html
         """
-        start_url = "https://www.zdaye.com/dayProxy.html"
-        html_tree = WebRequest().get(start_url, verify=False).tree
-        latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
+        start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html"
         from datetime import datetime
+        html_tree = WebRequest().get(start_url.format(datetime.now().year, datetime.now().month, 1), verify=False).tree
+        latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
         interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S")
-        if interval.seconds < 300:  # 只采集5分钟内的更新
+        if interval.seconds < 300:  # 只采集5分钟内的更新,当前7个小时更新一次
             target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip()
             while target_url:
                 _tree = WebRequest().get(target_url, verify=False).tree
@@ -44,7 +44,7 @@ def freeProxy01():
                     yield "%s:%s" % (ip, port)
                 next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href")
                 target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False
-                sleep(5)
+                sleep(10)
 
     @staticmethod
     def freeProxy02():
@@ -60,9 +60,14 @@ def freeProxy02():
                 yield "%s:%s" % (ip, port)
 
     @staticmethod
-    def freeProxy03():
-        """ 开心代理 """
-        target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"]
+    def freeProxy03(page_count=10):
+        """ 开心代理 http://www.kxdaili.com/dailiip.html """
+        target_url = "http://www.kxdaili.com/dailiip/{}/{}.html"
+        target_urls = []
+        for tabIndex in range(2):
+            for pageIndex in range(page_count):
+                target_urls.append(target_url.format(tabIndex + 1, pageIndex + 1))
+
         for url in target_urls:
             tree = WebRequest().get(url).tree
             for tr in tree.xpath("//table[@class='active']//tr")[1:]:
@@ -89,7 +94,7 @@ def parse_ip(input_str):
                 yield "%s:%s" % (ip, port)
 
     @staticmethod
-    def freeProxy05(page_count=1):
+    def freeProxy05(page_count=10):
         """ 快代理 https://www.kuaidaili.com """
         url_pattern = [
             'https://www.kuaidaili.com/free/inha/{}/',
@@ -103,7 +108,7 @@ def freeProxy05(page_count=1):
         for url in url_list:
             tree = WebRequest().get(url).tree
             proxy_list = tree.xpath('.//table//tr')
-            sleep(1)  # 必须sleep 不然第二条请求不到数据
+            sleep(10)  # 必须sleep 不然第二条请求不到数据
             for tr in proxy_list[1:]:
                 yield ':'.join(tr.xpath('./td/text()')[0:2])
 
@@ -129,16 +134,21 @@ def freeProxy07():
             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
             for proxy in proxies:
                 yield ":".join(proxy)
+            sleep(10)
 
     @staticmethod
     def freeProxy08():
         """ 小幻代理 """
-        urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html']
-        for url in urls:
-            r = WebRequest().get(url, timeout=10)
+        url = 'https://ip.ihuan.me/'
+        tree = WebRequest().get(url, verify=False).tree
+        hrefs = tree.xpath("//ul[@class='pagination']/li/a/@href")
+
+        for href in hrefs:
+            r = WebRequest().get(url + href, timeout=10)
             proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</a></td><td>\s*?(\d+)</td>', r.text)
             for proxy in proxies:
                 yield ":".join(proxy)
+            sleep(10)
 
     @staticmethod
     def freeProxy09(page_count=1):
@@ -154,12 +164,20 @@ def freeProxy09(page_count=1):
     @staticmethod
     def freeProxy10():
         """ 89免费代理 """
-        r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10)
-        proxies = re.findall(
-            r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
-            r.text)
-        for proxy in proxies:
-            yield ':'.join(proxy)
+        url = "https://www.89ip.cn/{}.html"
+        target_url = url.format('index_1')
+        next_page = True
+        while next_page:
+            r = WebRequest().get(target_url, timeout=10)
+            proxies = re.findall(
+                r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>', r.text)
+            for proxy in proxies:
+                yield ':'.join(proxy)
+
+            next_page = r.tree.xpath("//a[@class='layui-laypage-next']/@href")
+            next_page = next_page[0].strip() if next_page else False
+            target_url = url.format(next_page)
+            sleep(10)
 
     @staticmethod
     def freeProxy11():
@@ -236,7 +254,7 @@ def freeProxy11():
 
 if __name__ == '__main__':
     p = ProxyFetcher()
-    for _ in p.freeProxy11():
+    for _ in p.freeProxy10():
         print(_)
 
 # http://nntime.com/proxy-list-01.htm
@@ -244,4 +262,4 @@
 
 # freeProxy04
 # freeProxy07
-# freeProxy08
+# freeProxy08
\ No newline at end of file
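The recurring pattern this patch installs across the fetchers (follow the site's next-page link until it disappears, sleep a fixed interval between requests), distilled into a standalone sketch; `fetch_tree` and `parse_proxies` are illustrative placeholders, not project APIs:

```python
from time import sleep

def crawl_paginated(first_url, fetch_tree, parse_proxies,
                    next_page_xpath="//div[@class='page']/a[@title='下一页']/@href",
                    delay=10):
    """Yield proxies page by page, pausing `delay` seconds between requests."""
    target_url = first_url
    while target_url:
        tree = fetch_tree(target_url)        # e.g. WebRequest().get(url).tree
        for proxy in parse_proxies(tree):    # yields "ip:port" strings
            yield proxy
        next_page = tree.xpath(next_page_xpath)
        target_url = next_page[0].strip() if next_page else False
        sleep(delay)                         # the uniform 10s this patch settles on
```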
From a90aff691977620cc46569710e001595c43da749 Mon Sep 17 00:00:00 2001
From: wingser
Date: Sun, 8 Oct 2023 18:23:45 +0800
Subject: [PATCH 05/13] Update the Dockerfile; build my own image.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index cd555003e..854b99e87 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM python:3.6-alpine
 
-MAINTAINER jhao104
+MAINTAINER wingser
 
 WORKDIR /app

From 619b01842d1482e097ff3576d19422ebaa1872b7 Mon Sep 17 00:00:00 2001
From: wingser
Date: Wed, 11 Oct 2023 11:55:04 +0800
Subject: [PATCH 06/13] Add two proxy sites: wingser01 -> SEO Fangfa,
 wingser02 -> Xiaoshu proxy. Remove the dead fetcher freeProxy09. Pending:
 freeProxy01 -> no data, to fix; freeProxy02 -> no data, to fix;
 freeProxy04 -> the gateway blocks requests; freeProxy08 -> behind a
 Cloudflare check, cannot be handled normally.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .vscode/launch.json     | 19 ++++++++++++++++
 README.md               |  7 ++----
 fetcher/proxyFetcher.py | 50 +++++++++++++++++++++++++++++++++++++++--
 requirements.txt        |  3 ++-
 setting.py              | 12 +++++-----
 5 files changed, 78 insertions(+), 13 deletions(-)
 create mode 100644 .vscode/launch.json

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 000000000..9f078a4e2
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            //"justMyCode": true,
+            //"python": "${command:python.interpreterPath}",
+            "env": {"PYTHONPATH":"${workspaceRoot}"},
+            "envFile": "${workspaceRoot}/.env"
+        }
+    ]
+}
\ No newline at end of file

diff --git a/README.md b/README.md
index 5c1464f5d..c0af464ca 100644
--- a/README.md
+++ b/README.md
@@ -212,16 +212,13 @@ PROXY_FETCHER = [
   | 站大爷 | ✔ | ★ | ** | [地址](https://www.zdaye.com/) | [`freeProxy01`](/fetcher/proxyFetcher.py#L28) |
   | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) |
   | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) |
-  | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) |
   | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) |
   | FateZero | ✔ | ★★ | * | [地址](http://proxylist.fatezero.org) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) |
   | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L124) |
-  | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L134) |
-  | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L144) |
   | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L155) |
   | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L165) |
-
-
+  | SEO方法代理 | ✔ | ☆ | * | [地址](https://proxy.seofangfa.com/) | [`wingser01`](/fetcher/proxyFetcher.py#L194) |
+  | 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser01`](/fetcher/proxyFetcher.py#L206) |
   如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
 
 ### 问题反馈

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 1bb9ce1c6..3842fca52 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -17,6 +17,7 @@
 from time import sleep
 
 from util.webRequest import WebRequest
+from pyquery import PyQuery as pq
 
 
 class ProxyFetcher(object):
@@ -27,9 +28,9 @@ class ProxyFetcher(object):
     @staticmethod
     def freeProxy01():
         """
-        站大爷 https://www.zdaye.com/dayProxy.html
         """
         start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html"
+        站大爷 https://www.zdaye.com/dayProxy.html
         from datetime import datetime
         html_tree = WebRequest().get(start_url.format(datetime.now().year, datetime.now().month, 1), verify=False).tree
         latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
@@ -189,6 +190,51 @@ def freeProxy11():
             except Exception as e:
                 print(e)
 
+    @staticmethod
+    def wingser01():
+        """
+        seo方法 crawler, https://proxy.seofangfa.com/
+        """
+        url = 'https://proxy.seofangfa.com/'
+        html_tree = WebRequest().get(url, verify=False).tree
+        for index, tr in enumerate(html_tree.xpath("//table//tr")):
+            if index == 0:
+                continue
+            yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
+
+    @staticmethod
+    def wingser02():
+        """
+        小舒代理 crawler, http://www.xsdaili.cn/
+        """
+        url = 'http://www.xsdaili.cn/'
+        base_url = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
+
+        '''通过网站,获取最近10个日期的共享'''
+        urls = []
+        html = WebRequest().get(url, verify=False).tree
+        doc = pq(html)
+        title = doc(".title:eq(0) a").items()
+        latest_page = 0
+        for t in title:
+            res = re.search(r"/(\d+)\.html", t.attr("href"))
+            latest_page = int(res.group(1)) if res else 0
+        if latest_page:
+            urls = [base_url.format(page=page) for page in range(latest_page - 10, latest_page)]
+        else:
+            urls = []
+
+        '''每个日期的网站,爬proxy'''
+        for u in urls:
+            h = WebRequest().get(u, verify=False).tree
+            doc = pq(h)
+            contents = doc('.cont').text()
+            contents = contents.split("\n")
+            for content in contents:
+                yield content[:content.find("@")]
+
+
+
     # @staticmethod
     # def wallProxy01():
     #     """
@@ -254,7 +300,7 @@ def freeProxy11():
 
 if __name__ == '__main__':
     p = ProxyFetcher()
-    for _ in p.freeProxy10():
+    for _ in p.freeProxy01():
         print(_)

diff --git a/requirements.txt b/requirements.txt
index 53dc129b7..c762e2cd8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 requests==2.20.0
 gunicorn==19.9.0
-lxml==4.9.2
+lxml==4.9.3
 redis==3.5.3
 APScheduler==3.10.0;python_version>="3.10"
 APScheduler==3.2.0;python_version<"3.10"
@@ -10,3 +10,4 @@ Flask==2.1.1;python_version>"3.6"
 Flask==1.0;python_version<="3.6"
 werkzeug==2.1.0;python_version>"3.6"
 werkzeug==0.15.5;python_version<="3.6"
+pyquery>=1.4.3,<2.0.0
\ No newline at end of file

diff --git a/setting.py b/setting.py
index 9bab8475c..5055f3359 100644
--- a/setting.py
+++ b/setting.py
@@ -48,21 +48,23 @@
     "freeProxy01",
     "freeProxy02",
     "freeProxy03",
-    "freeProxy04",
+# "freeProxy04",
     "freeProxy05",
     "freeProxy06",
     "freeProxy07",
-    "freeProxy08",
-    "freeProxy09",
+# "freeProxy08",
+# "freeProxy09",
     "freeProxy10",
-    "freeProxy11"
+    "freeProxy11",
+    "wingser01",
+    "wingser02"
 ]
 
 # ############# proxy validator #################
 # 代理验证目标网站
 HTTP_URL = "http://httpbin.org"
 
-HTTPS_URL = "https://www.qq.com"
+HTTPS_URL = "https://jd.com"
 
 # 代理验证时超时时间
 VERIFY_TIMEOUT = 10
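The extraction step in wingser02 can be tried in isolation (a sketch with fabricated HTML; the fetcher splits `doc('.cont').text()` on `"\n"`, but since some pyquery versions squash whitespace in `text()`, this variant defensively splits on any whitespace):

```python
from pyquery import PyQuery as pq

# Fabricated stand-in for an xsdaili share page.
html = """<div class="cont">
1.2.3.4:8080@HTTP#remark
5.6.7.8:3128@HTTPS#remark
</div>"""

doc = pq(html)
for content in doc('.cont').text().split():
    if "@" in content:
        print(content[:content.find("@")])  # keep only the ip:port prefix
```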
From eff51c0b27c661a97314c6df40463d9229c23ed5 Mon Sep 17 00:00:00 2001
From: wingser
Date: Thu, 12 Oct 2023 11:25:49 +0800
Subject: [PATCH 07/13] Content got scrambled; fix a bug.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md               | 3 ++-
 fetcher/proxyFetcher.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c0af464ca..f04d5cee4 100644
--- a/README.md
+++ b/README.md
@@ -219,7 +219,8 @@ PROXY_FETCHER = [
   | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L165) |
   | SEO方法代理 | ✔ | ☆ | * | [地址](https://proxy.seofangfa.com/) | [`wingser01`](/fetcher/proxyFetcher.py#L194) |
   | 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser01`](/fetcher/proxyFetcher.py#L206) |
-  如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
+
+如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
 
 ### 问题反馈

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 3842fca52..887116472 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -28,9 +28,9 @@ class ProxyFetcher(object):
     @staticmethod
     def freeProxy01():
         """
+        站大爷 https://www.zdaye.com/dayProxy.html
         """
         start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html"
-        站大爷 https://www.zdaye.com/dayProxy.html
         from datetime import datetime
         html_tree = WebRequest().get(start_url.format(datetime.now().year, datetime.now().month, 1), verify=False).tree
         latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()

From 2ce467cd9cb3448ba6f62dae7a86a5366b5e5811 Mon Sep 17 00:00:00 2001
From: zwingser <126068653+zwingser@users.noreply.github.com>
Date: Thu, 12 Oct 2023 11:45:11 +0800
Subject: [PATCH 08/13] Update README.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Content got scrambled.
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c0af464ca..e7b2ad444 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,7 @@ PROXY_FETCHER = [
   | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L165) |
   | SEO方法代理 | ✔ | ☆ | * | [地址](https://proxy.seofangfa.com/) | [`wingser01`](/fetcher/proxyFetcher.py#L194) |
   | 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser01`](/fetcher/proxyFetcher.py#L206) |
+
 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
 
 ### 问题反馈

From 76bd4230a21151baad4150e40fe15d1ac48d81e9 Mon Sep 17 00:00:00 2001
From: zwingser <126068653+zwingser@users.noreply.github.com>
Date: Thu, 12 Oct 2023 11:47:13 +0800
Subject: [PATCH 09/13] Update proxyFetcher.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Content got scrambled; fix it.
---
 fetcher/proxyFetcher.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 3842fca52..09f4e118e 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -28,9 +28,9 @@ class ProxyFetcher(object):
     @staticmethod
     def freeProxy01():
         """
+        站大爷 https://www.zdaye.com/dayProxy.html
         """
         start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html"
-        站大爷 https://www.zdaye.com/dayProxy.html
         from datetime import datetime
         html_tree = WebRequest().get(start_url.format(datetime.now().year, datetime.now().month, 1), verify=False).tree
         latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
@@ -308,4 +308,4 @@ def wingser02():
 
 # freeProxy04
 # freeProxy07
-# freeProxy08
\ No newline at end of file
+# freeProxy08

From 098232ef55de4f62c0dd3e9c28366857588b21b7 Mon Sep 17 00:00:00 2001
From: wingser
Date: Thu, 12 Oct 2023 15:28:22 +0800
Subject: [PATCH 10/13] Add the wingser03, wingser04 and wingser05 fetchers;
 adjust the GitHub Actions automated image build.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/docker-image-latest.yml |   4 +-
 .github/workflows/docker-image-tags.yml   |   2 +-
 README.md                                 |  25 +++--
 fetcher/proxyFetcher.py                   | 126 ++++++++++------------
 setting.py                                |   3 +-
 5 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/.github/workflows/docker-image-latest.yml b/.github/workflows/docker-image-latest.yml
index 6c7e00ac6..5019fc979 100644
--- a/.github/workflows/docker-image-latest.yml
+++ b/.github/workflows/docker-image-latest.yml
@@ -25,11 +25,11 @@ jobs:
         id: meta
         uses: docker/metadata-action@v3
         with:
-          images: jhao104/proxy_pool
+          images: wingser/proxy_pool
 
       - name: Build and push Docker image
         uses: docker/build-push-action@v2
         with:
           context: .
           push: true
-          tags: jhao104/proxy_pool:latest
+          tags: wingser/proxy_pool:latest

diff --git a/.github/workflows/docker-image-tags.yml b/.github/workflows/docker-image-tags.yml
index 9a59645ad..b3c8d0176 100644
--- a/.github/workflows/docker-image-tags.yml
+++ b/.github/workflows/docker-image-tags.yml
@@ -25,7 +25,7 @@ jobs:
         id: meta
         uses: docker/metadata-action@v3
         with:
-          images: jhao104/proxy_pool
+          images: wingser/proxy_pool
 
       - name: Build and push Docker image
         uses: docker/build-push-action@v2

diff --git a/README.md b/README.md
index e7b2ad444..1ecf296b3 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ ProxyPool 爬虫代理IP池
 
 爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。
 
+[原作者项目地址](https://github.com/jhao104/proxy_pool) 感谢jhao104之前项目贡献,因原项目更新缓慢,自己fork开启更新维护。欢迎提建议,我尽量更新,如果我更新也慢了,你可以考虑自己从原项目fork一份自己维护。
+
 * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest)
 
 * 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/)
@@ -43,13 +45,14 @@ ProxyPool 爬虫代理IP池
 * git clone
 
 ```bash
-git clone git@github.com:jhao104/proxy_pool.git
+git clone git@github.com:wingser/proxy_pool.git
 ```
 
 * releases
 
 ```bash
-https://github.com/jhao104/proxy_pool/releases 下载对应zip文件
+https://github.com/wingser/proxy_pool/releases 下载对应zip文件
+建议docker安装。
 ```
 
 ##### 安装依赖:
@@ -101,9 +104,9 @@ python proxyPool.py server
 ### Docker Image
 
 ```bash
-docker pull jhao104/proxy_pool
+docker pull wingser/proxy_pool
 
-docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest
+docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 --name wingser_pool wingser/proxy_pool:latest
 ```
 
 ### docker-compose
@@ -218,13 +221,17 @@ PROXY_FETCHER = [
 | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L155) |
 | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L165) |
 | SEO方法代理 | ✔ | ☆ | * | [地址](https://proxy.seofangfa.com/) | [`wingser01`](/fetcher/proxyFetcher.py#L194) |
-| 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser01`](/fetcher/proxyFetcher.py#L206) |
-
-如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。
+| 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser02`](/fetcher/proxyFetcher.py#L206) |
+| PzzQz代理 | ✔ | ☆ | * | [地址](https://pzzqz.com/) | [`wingser03`](/fetcher/proxyFetcher.py#L244) |
+| proxy-list | ✔ | ☆ | * | [地址](https://proxy-list.org/) | [`wingser04`](/fetcher/proxyFetcher.py#L269) |
+| proxylistplus| ✔ | ☆ | * | [地址](https://list.proxylistplus.com/)| [`wingser05`](/fetcher/proxyFetcher.py#L284) |
+
+
+如果还有其他好的免费代理网站, 可以在提交在[Issues](https://github.com/zwingser/proxy_pool/issues), 下次更新时会考虑在项目中支持。
 
 ### 问题反馈
 
-  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。
+  任何问题欢迎在[Issues](https://github.com/zwingser/proxy_pool/issues) 中反馈。
 
   你的反馈会让此项目变得更加完美。
 
@@ -232,7 +239,7 @@ PROXY_FETCHER = [
 
   本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。
 
-  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。
+  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/zwingser/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。
 
   这里感谢以下contributor的无私奉献:

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 09f4e118e..5463555cb 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -29,6 +29,7 @@ class ProxyFetcher(object):
     def freeProxy01():
         """
         站大爷 https://www.zdaye.com/dayProxy.html
+        好像屏蔽了国外服务器,国内可以正常爬取.
         """
         start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html"
         from datetime import datetime
@@ -164,7 +165,11 @@ def freeProxy09(page_count=1):
 
     @staticmethod
     def freeProxy10():
-        """ 89免费代理 """
+        """
+        89免费代理
+        怀疑封国外请求,境外服务器爬取异常.
+
+        """
         url = "https://www.89ip.cn/{}.html"
         target_url = url.format('index_1')
         next_page = True
@@ -235,77 +240,64 @@ def wingser02():
 
 
 
-    # @staticmethod
-    # def wallProxy01():
-    #     """
-    #     PzzQz https://pzzqz.com/
-    #     """
-    #     from requests import Session
-    #     from lxml import etree
-    #     session = Session()
-    #     try:
-    #         index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text
-    #         x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp)
-    #         if x_csrf_token:
-    #             data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""}
-    #             proxy_resp = session.post("https://pzzqz.com/", verify=False,
-    #                                       headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json()
-    #             tree = etree.HTML(proxy_resp["proxy_html"])
-    #             for tr in tree.xpath("//tr"):
-    #                 ip = "".join(tr.xpath("./td[1]/text()"))
-    #                 port = "".join(tr.xpath("./td[2]/text()"))
-    #                 yield "%s:%s" % (ip, port)
-    #     except Exception as e:
-    #         print(e)
-
-    # @staticmethod
-    # def freeProxy10():
-    #     """
-    #     墙外网站 cn-proxy
-    #     :return:
-    #     """
-    #     urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
-    #     request = WebRequest()
-    #     for url in urls:
-    #         r = request.get(url, timeout=10)
-    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', r.text)
-    #         for proxy in proxies:
-    #             yield ':'.join(proxy)
-
-    # @staticmethod
-    # def freeProxy11():
-    #     """
-    #     https://proxy-list.org/english/index.php
-    #     :return:
-    #     """
-    #     urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
-    #     request = WebRequest()
-    #     import base64
-    #     for url in urls:
-    #         r = request.get(url, timeout=10)
-    #         proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
-    #         for proxy in proxies:
-    #             yield base64.b64decode(proxy).decode()
-
-    # @staticmethod
-    # def freeProxy12():
-    #     urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
-    #     request = WebRequest()
-    #     for url in urls:
-    #         r = request.get(url, timeout=10)
-    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
-    #         for proxy in proxies:
-    #             yield ':'.join(proxy)
+    @staticmethod
+    def wingser03():
+        """
+        PzzQz https://pzzqz.com/
+        """
+        from requests import Session
+        from lxml import etree
+        session = Session()
+        try:
+            index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text
+            x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp)
+            if x_csrf_token:
+                data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""}
+                proxy_resp = session.post("https://pzzqz.com/", verify=False,
+                                          headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json()
+                tree = etree.HTML(proxy_resp["proxy_html"])
+                for tr in tree.xpath("//tr"):
+                    ip = "".join(tr.xpath("./td[1]/text()"))
+                    port = "".join(tr.xpath("./td[2]/text()"))
+                    yield "%s:%s" % (ip, port)
+        except Exception as e:
+            print(e)
+
+    @staticmethod
+    def wingser04():
+        """
+        https://proxy-list.org/english/index.php
+        :return:
+        """
+        urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
+        request = WebRequest()
+        import base64
+        for url in urls:
+            r = request.get(url, timeout=10)
+            proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
+            for proxy in proxies:
+                yield base64.b64decode(proxy).decode()
+
+    @staticmethod
+    def wingser05():
+        urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
+        request = WebRequest()
+        for url in urls:
+            r = request.get(url, timeout=10)
+            proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
+            for proxy in proxies:
+                yield ':'.join(proxy)
 
 
 if __name__ == '__main__':
     p = ProxyFetcher()
-    for _ in p.freeProxy01():
+    for _ in p.wingser05():
         print(_)
 
-# http://nntime.com/proxy-list-01.htm
 
-# freeProxy04
-# freeProxy07
-# freeProxy08
+
+# http://nntime.com/proxy-list-01.htm
+

diff --git a/setting.py b/setting.py
index 5055f3359..dbd82db2e 100644
--- a/setting.py
+++ b/setting.py
@@ -57,7 +57,8 @@
     "freeProxy10",
     "freeProxy11",
     "wingser01",
-    "wingser02"
+    "wingser02",
+    "wingser03"
 ]
 
 # ############# proxy validator #################
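wingser04 relies on proxy-list.org embedding each entry base64-encoded inside a Proxy('...') call; the decode step in isolation (the sample strings are fabricated but round-trip correctly):

```python
import base64
import re

# Fabricated page fragment in proxy-list.org's embedded format.
page = "Proxy('MS4yLjMuNDo4MDgw') Proxy('NS42LjcuODozMTI4')"

for encoded in re.findall(r"Proxy\('(.*?)'\)", page):
    print(base64.b64decode(encoded).decode())  # -> 1.2.3.4:8080, 5.6.7.8:3128
```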
From 56e72185475b4b9e168c45d8142f30e260083bc2 Mon Sep 17 00:00:00 2001
From: wingser
Date: Thu, 12 Oct 2023 15:43:43 +0800
Subject: [PATCH 11/13] Forgot to register the newly added fetcher methods in
 the config; add them.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 setting.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/setting.py b/setting.py
index dbd82db2e..436a91342 100644
--- a/setting.py
+++ b/setting.py
@@ -48,17 +48,16 @@
     "freeProxy01",
     "freeProxy02",
     "freeProxy03",
-# "freeProxy04",
     "freeProxy05",
     "freeProxy06",
     "freeProxy07",
-# "freeProxy08",
-# "freeProxy09",
     "freeProxy10",
     "freeProxy11",
     "wingser01",
     "wingser02",
-    "wingser03"
+    "wingser03",
+    "wingser04",
+    "wingser05"
 ]
 
 # ############# proxy validator #################
From 9e6077433bbf8cd4b8f7df9e8cebee43bca877eb Mon Sep 17 00:00:00 2001
From: wingser
Date: Thu, 12 Oct 2023 18:50:51 +0800
Subject: [PATCH 12/13] Load fetchers automatically; previously method names
 had to be registered in the config file, which was inconvenient (the old
 design was not great; I missed registering my own additions several times).
 Also move temporarily broken and still-in-debugging fetchers into
 proxyFetcherBak so they are not auto-loaded at startup.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                  | 23 +---------
 fetcher/proxyFetcher.py    | 43 --------------------
 fetcher/proxyFetcherBak.py | 82 ++++++++++++++++++++++++++++++++++++++
 handler/configHandler.py   | 17 +++++---
 setting.py                 | 31 +++++++-------
 util/six.py                |  8 ++--
 6 files changed, 114 insertions(+), 90 deletions(-)
 create mode 100644 fetcher/proxyFetcherBak.py

diff --git a/README.md b/README.md
index 1ecf296b3..bb8be0c41 100644
--- a/README.md
+++ b/README.md
@@ -77,14 +77,6 @@ PORT = 5000  # 监听端口
 
 DB_CONN = 'redis://:pwd@127.0.0.1:8888/0'
 
-
-# 配置 ProxyFetcher
-
-PROXY_FETCHER = [
-    "freeProxy01",  # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py
-    "freeProxy02",
-    # ....
-]
 ```
 
 #### 启动项目:
@@ -190,20 +182,7 @@ class ProxyFetcher(object):
         # 确保每个proxy都是 host:ip正确的格式返回
 ```
 
-* 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项:
-
-  在`PROXY_FETCHER`下添加自定义方法的名字:
-
-```python
-PROXY_FETCHER = [
-    "freeProxy01",
-    "freeProxy02",
-    # ....
-    "freeProxyCustom1"  #  # 确保名字和你添加方法名字一致
-]
-```
-
-
+* 2、添加好方法后,改为自动加载,无需配置。(原设计不太合理,我自己提交都漏掉几次,直接改自动加载):
   `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。

diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py
index 5463555cb..43fbb345f 100644
--- a/fetcher/proxyFetcher.py
+++ b/fetcher/proxyFetcher.py
@@ -77,24 +77,6 @@ def freeProxy03(page_count=10):
                 port = "".join(tr.xpath('./td[2]/text()')).strip()
                 yield "%s:%s" % (ip, port)
 
-    @staticmethod
-    def freeProxy04():
-        """ FreeProxyList https://www.freeproxylists.net/zh/ """
-        url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
-        tree = WebRequest().get(url, verify=False).tree
-        from urllib import parse
-
-        def parse_ip(input_str):
-            html_str = parse.unquote(input_str)
-            ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str)
-            return ips[0] if ips else None
-
-        for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"):
-            ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip())
-            port = "".join(tr.xpath('./td[2]/text()')).strip()
-            if ip:
-                yield "%s:%s" % (ip, port)
-
     @staticmethod
     def freeProxy05(page_count=10):
         """ 快代理 https://www.kuaidaili.com """
@@ -138,31 +120,6 @@ def freeProxy07():
             yield ":".join(proxy)
             sleep(10)
 
-    @staticmethod
-    def freeProxy08():
-        """ 小幻代理 """
-        url = 'https://ip.ihuan.me/'
-        tree = WebRequest().get(url, verify=False).tree
-        hrefs = tree.xpath("//ul[@class='pagination']/li/a/@href")
-
-        for href in hrefs:
-            r = WebRequest().get(url + href, timeout=10)
-            proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</a></td><td>\s*?(\d+)</td>', r.text)
-            for proxy in proxies:
-                yield ":".join(proxy)
-            sleep(10)
-
-    @staticmethod
-    def freeProxy09(page_count=1):
-        """ 免费代理库 """
-        for i in range(1, page_count + 1):
-            url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
-            html_tree = WebRequest().get(url, verify=False).tree
-            for index, tr in enumerate(html_tree.xpath("//table//tr")):
-                if index == 0:
-                    continue
-                yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
-
     @staticmethod
     def freeProxy10():
         """

diff --git a/fetcher/proxyFetcherBak.py b/fetcher/proxyFetcherBak.py
new file mode 100644
index 000000000..4cd35f3e1
--- /dev/null
+++ b/fetcher/proxyFetcherBak.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+"""
+-------------------------------------------------
+   File Name:    proxyFetcherBak
+   Description : 原文件改为自动加载爬虫程序,所以调试中或者失效的程序也会加载,
+                 把调试中的程序,或者失效程序,放到这个文件里面.
+   Author :      wingser
+   date:         2016/11/25
+-------------------------------------------------
+   Change Activity:
+                   2016/11/25: proxyFetcherBak
+-------------------------------------------------
+"""
+__author__ = 'JHao'
+
+import json
+import re
+from time import sleep
+
+from util.webRequest import WebRequest
+from pyquery import PyQuery as pq
+
+
+class ProxyFetcherBak(object):
+    """
+    proxy getter
+    """
+
+    @staticmethod
+    def freeProxy04():
+        """ FreeProxyList https://www.freeproxylists.net/zh/ """
+        url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
+        tree = WebRequest().get(url, verify=False).tree
+        from urllib import parse
+
+        def parse_ip(input_str):
+            html_str = parse.unquote(input_str)
+            ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str)
+            return ips[0] if ips else None
+
+        for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"):
+            ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip())
+            port = "".join(tr.xpath('./td[2]/text()')).strip()
+            if ip:
+                yield "%s:%s" % (ip, port)
+
+    @staticmethod
+    def freeProxy08():
+        """ 小幻代理 """
+        url = 'https://ip.ihuan.me/'
+        tree = WebRequest().get(url, verify=False).tree
+        hrefs = tree.xpath("//ul[@class='pagination']/li/a/@href")
+
+        for href in hrefs:
+            r = WebRequest().get(url + href, timeout=10)
+            proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</a></td><td>\s*?(\d+)</td>', r.text)
+            for proxy in proxies:
+                yield ":".join(proxy)
+            sleep(10)
+
+    @staticmethod
+    def freeProxy09(page_count=1):
+        """ 免费代理库 """
+        for i in range(1, page_count + 1):
+            url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
+            html_tree = WebRequest().get(url, verify=False).tree
+            for index, tr in enumerate(html_tree.xpath("//table//tr")):
+                if index == 0:
+                    continue
+                yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
+
+
+if __name__ == '__main__':
+    p = ProxyFetcherBak()
+    for _ in p.freeProxy09():
+        print(_)
+
+
+
+
+# http://nntime.com/proxy-list-01.htm
+

diff --git a/handler/configHandler.py b/handler/configHandler.py
index 29000bcc6..a68f49b95 100644
--- a/handler/configHandler.py
+++ b/handler/configHandler.py
@@ -12,16 +12,18 @@
 """
 __author__ = 'JHao'
 
-import os
+import os, inspect
 
 import setting
+from fetcher.proxyFetcher import ProxyFetcher
 from util.singleton import Singleton
 from util.lazyProperty import LazyProperty
-from util.six import reload_six, withMetaclass
+from util.six import withMetaclass
 
 
 class ConfigHandler(withMetaclass(Singleton)):
 
     def __init__(self):
+        self.fetchersMethord = [method for method in dir(ProxyFetcher) if callable(getattr(ProxyFetcher, method)) and not method.startswith("__")]
         pass
 
     @LazyProperty
@@ -40,11 +42,10 @@ def dbConn(self):
     def tableName(self):
         return os.getenv("TABLE_NAME", setting.TABLE_NAME)
 
-    @property
+    @LazyProperty
     def fetchers(self):
-        reload_six(setting)
-        return setting.PROXY_FETCHER
-
+        return [method for method in dir(ProxyFetcher) if callable(getattr(ProxyFetcher, method)) and not method.startswith("__")]
+
     @LazyProperty
     def httpUrl(self):
         return os.getenv("HTTP_URL", setting.HTTP_URL)
@@ -81,3 +82,7 @@ def proxyRegion(self):
     def timezone(self):
         return os.getenv("TIMEZONE", setting.TIMEZONE)
+
+
+if __name__ == '__main__':
+    config = ConfigHandler()
+    print(config.fetchers)

diff --git a/setting.py b/setting.py
index 436a91342..208e12d8f 100644
--- a/setting.py
+++ b/setting.py
@@ -44,21 +44,22 @@
 
 # ###### config the proxy fetch function ######
-PROXY_FETCHER = [
-    "freeProxy01",
-    "freeProxy02",
-    "freeProxy03",
-    "freeProxy05",
-    "freeProxy06",
-    "freeProxy07",
-    "freeProxy10",
-    "freeProxy11",
-    "wingser01",
-    "wingser02",
-    "wingser03",
-    "wingser04",
-    "wingser05"
-]
+# 改为自动加载,不需要配置.
+# PROXY_FETCHER = [
+#     "freeProxy01",
+#     "freeProxy02",
+#     "freeProxy03",
+#     "freeProxy05",
+#     "freeProxy06",
+#     "freeProxy07",
+#     "freeProxy10",
+#     "freeProxy11",
+#     "wingser01",
+#     "wingser02",
+#     "wingser03",
+#     "wingser04",
+#     "wingser05"
+# ]
 
 # ############# proxy validator #################

diff --git a/util/six.py b/util/six.py
index 14ee059ba..7d858834c 100644
--- a/util/six.py
+++ b/util/six.py
@@ -29,10 +29,10 @@ def iteritems(d, **kw):
 else:
     from urlparse import urlparse
 
-if PY3:
-    from imp import reload as reload_six
-else:
-    reload_six = reload
+# if PY3:
+#     from imp import reload as reload_six
+# else:
+#     reload_six = reload
 
 if PY3:
     from queue import Empty, Queue
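The auto-discovery that configHandler.py now performs is plain attribute reflection; the same filter in a standalone sketch (FakeFetcher is an illustrative stand-in for ProxyFetcher):

```python
class FakeFetcher(object):
    @staticmethod
    def freeProxy01():
        yield "1.2.3.4:8080"

    @staticmethod
    def wingser01():
        yield "5.6.7.8:3128"

# Every public callable attribute of the class counts as a fetcher.
fetchers = [m for m in dir(FakeFetcher)
            if callable(getattr(FakeFetcher, m)) and not m.startswith("__")]
print(fetchers)  # ['freeProxy01', 'wingser01']
```

The trade-off, as the commit message notes, is that anything public on the class gets scheduled, which is why broken or in-debugging fetchers had to move out into proxyFetcherBak.py.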
"freeProxy05", - "freeProxy06", - "freeProxy07", - "freeProxy10", - "freeProxy11", - "wingser01", - "wingser02", - "wingser03", - "wingser04", - "wingser05" -] +# 改为自动加载,不需要配置. +# PROXY_FETCHER = [ +# "freeProxy01", +# "freeProxy02", +# "freeProxy03", +# "freeProxy05", +# "freeProxy06", +# "freeProxy07", +# "freeProxy10", +# "freeProxy11", +# "wingser01", +# "wingser02", +# "wingser03", +# "wingser04", +# "wingser05" +# ] # ############# proxy validator ################# # 代理验证目标网站 diff --git a/util/six.py b/util/six.py index 14ee059ba..7d858834c 100644 --- a/util/six.py +++ b/util/six.py @@ -29,10 +29,10 @@ def iteritems(d, **kw): else: from urlparse import urlparse -if PY3: - from imp import reload as reload_six -else: - reload_six = reload +# if PY3: +# from imp import reload as reload_six +# else: +# reload_six = reload if PY3: from queue import Empty, Queue From bc3a1680b8569ee1a1d109bbc689db5b5e72f90e Mon Sep 17 00:00:00 2001 From: wingser Date: Wed, 6 Dec 2023 14:06:36 +0800 Subject: [PATCH 13/13] =?UTF-8?q?=E6=97=A5=E5=BF=97=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=A4=AA=E5=A4=A7=E4=BA=86,=E9=BB=98=E8=AE=A4=E4=B8=8D?= =?UTF-8?q?=E5=81=9A=E6=96=87=E4=BB=B6=E8=BE=93=E5=87=BA,=E5=8F=AA?= =?UTF-8?q?=E8=BE=93=E5=87=BA=E5=88=B0=E5=B1=8F=E5=B9=95.=E4=B9=8B?= =?UTF-8?q?=E5=89=8D=E6=8A=8A=E6=88=91=E7=9A=84vps=E7=A1=AC=E7=9B=98?= =?UTF-8?q?=E5=B9=B2=E6=8E=89=E4=BA=86=E5=87=A0=E5=8D=81=E4=B8=AAG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- handler/logHandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handler/logHandler.py b/handler/logHandler.py index 45cd1201d..84e943836 100644 --- a/handler/logHandler.py +++ b/handler/logHandler.py @@ -46,7 +46,7 @@ class LogHandler(logging.Logger): LogHandler """ - def __init__(self, name, level=DEBUG, stream=True, file=True): + def __init__(self, name, level=DEBUG, stream=True, file=False): self.name = name self.level = level logging.Logger.__init__(self, self.name, level=level)