From fbe14a2745b8af4a6c9a87d27f9c7cb6698e4370 Mon Sep 17 00:00:00 2001 From: blankie Date: Tue, 12 Dec 2023 20:54:34 +1100 Subject: [PATCH 1/2] [postmill] add support --- docs/configuration.rst | 10 ++ docs/supportedsites.md | 10 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/postmill.py | 204 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 7 ++ test/results/raddle.py | 112 +++++++++++++++++ 6 files changed, 344 insertions(+) create mode 100644 gallery_dl/extractor/postmill.py create mode 100644 test/results/raddle.py diff --git a/docs/configuration.rst b/docs/configuration.rst index a749743c42d..c49dc2c91da 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2734,6 +2734,16 @@ Description Also search Plurk comments for URLs. +extractor.[postmill].save-link-post-body +---------------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Whether or not to save the body for link/image posts. + + extractor.reactor.gif --------------------- Type diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 003dcaa9793..ce490aa2e40 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1316,6 +1316,16 @@ Consider all sites to be NSFW unless otherwise known. 
+ + Postmill Instances + + + Raddle + https://raddle.me/ + Forums, Home Feed, Individual Posts, Search Results, Tag Searches, User Profiles + + + Reactor Instances diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d074de22ebe..695b8b2a267 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -124,6 +124,7 @@ "poipiku", "pornhub", "pornpics", + "postmill", "pururin", "reactor", "readcomiconline", diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py new file mode 100644 index 00000000000..4d4b38a2e29 --- /dev/null +++ b/gallery_dl/extractor/postmill.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Postmill instances""" + +import re +import urllib.parse +from .common import BaseExtractor, Message +from .. import text, exception + + +class PostmillExtractor(BaseExtractor): + """Base class for Postmill extractors""" + basecategory = "postmill" + directory_fmt = ("{category}", "{instance}", "{forum}") + filename_fmt = "{id}_{title[:220]}.{extension}" + archive_fmt = "{filename}" + + def _init(self): + self.instance = self.root.partition("://")[2] + self.save_link_post_body = self.config("save-link-post-body", False) + self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search + self._search_image_tag = re.compile( + r'')) + date = text.parse_datetime(extr( + '')) + username = extr( + '') + post_canonical_url = text.unescape(extr( + '')) + + url = text.unescape(extr( + '

', + '') + + match = self._search_canonical_url(post_canonical_url) + forum = match.group(1) + id = int(match.group(2)) + + is_text_post = url.startswith("/") + is_image_post = self._search_image_tag(response.text) is not None + data = { + "title": title, + "date": date, + "username": username, + "forum": forum, + "id": id, + "flair": [text.unescape(i) for i in text.extract_iter( + response.text, '', '')], + "instance": self.instance, + } + + urls = [] + if is_text_post or self.save_link_post_body: + urls.append((Message.Url, "text:" + body)) + + if is_image_post: + urls.append((Message.Url, url)) + elif not is_text_post: + urls.append((Message.Queue, url)) + + data["count"] = len(urls) + yield Message.Directory, data + for data["num"], (msg, url) in enumerate(urls, 1): + if url.startswith("text:"): + data["filename"], data["extension"] = "", "htm" + else: + data = text.nameext_from_url(url, data) + + yield msg, url, data + + +class PostmillSubmissionsExtractor(PostmillExtractor): + """Base class for Postmill submissions extractors""" + whitelisted_parameters = () + + def __init__(self, match): + PostmillExtractor.__init__(self, match) + self.base = match.group(3) + self.sorting_path = match.group(4) or "" + self.query = {key: value for key, value in text.parse_query( + match.group(5) or "").items() if self.acceptable_query(key)} + + def items(self): + url = self.root + self.base + self.sorting_path + if self.query: + url += "?" + urllib.parse.urlencode(self.query) + + while url: + response = self.request(url) + if response.history: + redirect_url = response.url + if redirect_url == self.root + "/login": + raise exception.StopExtraction( + "HTTP redirect to login page (%s)", redirect_url) + + for nav in text.extract_iter(response.text, + ''): + post_url = text.unescape(text.extr(nav, '