diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 7fd476c6d6..8ee66e8c44 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -2,6 +2,10 @@ ## Unreleased +### Enhancements + +- **No longer proxies images from `*.wp.com` when generating Telegraph posts**: `*.wp.com` is in the blocklist of `wsrv.nl` (environment variable `IMAGES_WESERV_NL`). Thus, these images are no longer proxied when generating Telegraph posts. All images from `*.wp.com` can be accessed with any referer header, so they are now kept as is. + ### Bug fixes - **Canonical `DATABASE_URL` not recognized**: Since v2.9.0, `DATABASE_URL` is canonicalized before connecting to the corresponding database. However, a canonical URL pointing to a local path cannot be recognized when checking the validity of the scheme (database type). Both canonical (`scheme:/path/to/file.db`) and traditional (`scheme:///path/to/file.db`) forms of such URLs are recognized correctly now. diff --git a/docs/CHANGELOG.zh.md b/docs/CHANGELOG.zh.md index 61788bbec9..12a639896c 100644 --- a/docs/CHANGELOG.zh.md +++ b/docs/CHANGELOG.zh.md @@ -2,6 +2,10 @@ ## 未发布 +### 增强 + +- **生成 Telegraph 文章时,不再代理来自 `*.wp.com` 的图像**: `*.wp.com` 位于 `wsrv.nl` (环境变量 `IMAGES_WESERV_NL`) 的阻断列表中。因此,在生成 Telegraph 文章时,这些图像不再被代理。来自 `*.wp.com` 的所有图片都可以用任何 referer 头访问,因此它们现在保持原样。 + ### Bug 修复 - **无法识别规范的 `DATABASE_URL`**: 自 v2.9.0 起, 在连接到相应的数据库之前,`DATABASE_URL` 被规范化。然而,在检查 scheme (数据库类型) 的合法性时,无法识别指向本地路径的规范 URL。现在,此类 URL 的规范 (`scheme:/path/to/file.db`) 和传统 (`scheme:///path/to/file.db`) 形式都被正确识别。 diff --git a/src/parsing/tgraph.py b/src/parsing/tgraph.py index 5702db40d2..ad116e03dc 100644 --- a/src/parsing/tgraph.py +++ b/src/parsing/tgraph.py @@ -20,8 +20,9 @@ from collections.abc import Awaitable import asyncio -import time import aiographfix as aiograph +import re +import time from io import BytesIO from bs4 import BeautifulSoup from contextlib import suppress @@ -40,6 +41,29 @@ else: convert_table_to_png = None +DOMAIN_PATTERN_TEMPLATE: Final[str] = 
r'^https?://(?:[^./]+\.)*(?:{domains})\.?(?:[/:?#]|$)' +BLOCKED_BY_WESERV_DOMAIN: Final[set[str]] = { + 'sinaimg.cn', + 'wp.com', +} +BLOCKED_BY_WESERV_RE: Final[re.Pattern] = re.compile( + DOMAIN_PATTERN_TEMPLATE.format( + domains='|'.join(map(re.escape, BLOCKED_BY_WESERV_DOMAIN)), + ), + re.I, +) +ALLOW_REFERER_DOMAIN: Final[set[str]] = set(filter(None, { + 'wp.com', + env.IMG_RELAY_SERVER.partition('://')[2].partition('/')[0].strip('.'), + env.IMAGES_WESERV_NL.partition('://')[2].partition('/')[0].strip('.'), +})) +ALLOW_REFERER_RE: Final[re.Pattern] = re.compile( + DOMAIN_PATTERN_TEMPLATE.format( + domains='|'.join(map(re.escape, ALLOW_REFERER_DOMAIN)), + ), + re.I, +) + logger = log.getLogger('RSStT.tgraph') apis: Optional[APIs] = None @@ -309,13 +333,15 @@ async def generate_page(self): if not isAbsoluteHttpLink(attr_content): tag.replaceWithChildren() continue - if not attr_content.startswith(env.IMG_RELAY_SERVER): + if not ALLOW_REFERER_RE.match(attr_content): if tag.name == 'video': attr_content = env.IMG_RELAY_SERVER + attr_content - if tag.name == 'img' and not attr_content.startswith(env.IMAGES_WESERV_NL): - if attr_content.split('.', 1)[1].split('/', 1)[0] == 'sinaimg.cn': - attr_content = env.IMG_RELAY_SERVER + attr_content - attr_content = construct_weserv_url(attr_content) + elif tag.name == 'img': + attr_content = ( + env.IMG_RELAY_SERVER + attr_content + if BLOCKED_BY_WESERV_RE.match(attr_content) + else construct_weserv_url(attr_content) + ) tag.attrs = {attr_name: attr_content} if self.feed_title: