-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
6 changed files
with
343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,6 +124,7 @@ | |
"poipiku", | ||
"pornhub", | ||
"pornpics", | ||
"postmill", | ||
"pururin", | ||
"reactor", | ||
"readcomiconline", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 2 as | ||
# published by the Free Software Foundation. | ||
|
||
"""Extractors for Postmill instances""" | ||
|
||
import re | ||
from .common import BaseExtractor, Message | ||
from .. import text, exception | ||
|
||
|
||
class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    # NOTE(review): items() sets "filename" to "" for every "text:" URL, so
    # all text bodies presumably end up with the same archive key - confirm
    # whether this format should also include "{id}"
    archive_fmt = "{filename}"

    def _init(self):
        """Set up per-instance state and the compiled page matchers"""
        # everything after "://" in the root URL; exposed as "instance"
        self.instance = self.root.partition("://")[2]
        # when enabled, the post body is emitted for link/image posts too
        self.save_link_post_body = self.config("save-link-post-body", False)
        self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
        self._search_image_tag = re.compile(
            r'<a href="[^"]+"\n +class="submission__image-link"').search

    def items(self):
        """Yield directory, URL and queue messages for each post URL

        Post URLs come from self.post_urls(), which this class does not
        define itself.

        Raises exception.StopExtraction when a fetched page does not
        contain a recognizable canonical post URL.
        """
        for post_url in self.post_urls():
            page = self.request(post_url).text
            extr = text.extract_from(page)

            # NOTE: 'extr' is called in this fixed order on purpose;
            # presumably it scans the page sequentially - do not reorder
            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = text.parse_datetime(extr(
                '<meta property="og:article:published_time" content="', '">'))
            username = extr(
                '<meta property="og:article:author" content="', '">')
            post_canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))

            url = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            match = self._search_canonical_url(post_canonical_url)
            if match is None:
                # without a canonical URL neither forum nor ID are known;
                # stop cleanly instead of failing with an AttributeError
                raise exception.StopExtraction(
                    "Unable to find canonical post URL (%s)", post_url)
            forum = match.group(1)
            post_id = int(match.group(2))

            # a title link starting with "/" is treated as a text post
            is_text_post = url.startswith("/")
            is_image_post = self._search_image_tag(page) is not None
            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": post_id,
                "flair": [text.unescape(i) for i in text.extract_iter(
                    page, '<span class="flair__label">', '</span>')],
                "instance": self.instance,
            }

            urls = []
            if is_text_post or self.save_link_post_body:
                urls.append((Message.Url, "text:" + body))

            if is_image_post:
                # image links are emitted as direct file URLs
                urls.append((Message.Url, url))
            elif not is_text_post:
                # any other external link is queued
                urls.append((Message.Queue, url))

            data["count"] = len(urls)
            yield Message.Directory, data
            for data["num"], (msg, file_url) in enumerate(urls, 1):
                if file_url.startswith("text:"):
                    data["filename"], data["extension"] = "", "htm"
                else:
                    data = text.nameext_from_url(file_url, data)

                yield msg, file_url, data
|
||
|
||
class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for Postmill submissions extractors"""
    # additional query parameter names accepted by acceptable_query()
    whitelisted_parameters = ()

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        # the last three capture groups are, in order:
        # listing path, optional sorting path, optional query string
        captured = match.groups()
        self.base = captured[-3]
        self.sorting_path = captured[-2] or ""

        # keep only the query parameters accepted by acceptable_query()
        self.query = {}
        for name, value in text.parse_query(captured[-1]).items():
            if self.acceptable_query(name):
                self.query[name] = value

    def items(self):
        """Queue every submission found on the listing and its next pages"""
        page_url = self.root + self.base + self.sorting_path

        while page_url:
            response = self.request(page_url, params=self.query)
            # a redirect that ended on the login page stops the extraction
            if response.history and response.url == self.root + "/login":
                raise exception.StopExtraction(
                    "HTTP redirect to login page (%s)", response.url)
            page = response.text

            for nav in text.extract_iter(
                    page, '<nav class="submission__nav">', '</nav>'):
                href = text.unescape(text.extr(nav, '<a href="', '"'))
                target = text.urljoin(page_url, href)
                yield Message.Queue, target, {
                    "_extractor": PostmillPostExtractor}

            # an empty result ends the loop
            page_url = text.unescape(text.extr(
                page, '<link rel="next" href="', '">'))

    def acceptable_query(self, key):
        """Return whether query parameter 'key' is kept in self.query"""
        if key == "t" or key in self.whitelisted_parameters:
            return True
        return key.startswith("next[") and key.endswith("]")
|
||
|
||
# Instance table handed to PostmillExtractor.update(); its return value is
# used as the common prefix of every extractor pattern in this module.
BASE_PATTERN = PostmillExtractor.update({
    "raddle": {
        "root"   : None,
        "pattern": (
            r"(?:raddle\.me|"
            r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
            r"\.onion)"
        ),
    },
})

# optional query string, captured without its leading "?"
QUERY_RE = r"(?:\?([^#]+))?$"

# optional sorting path segment (captured), followed by QUERY_RE
SORTING_RE = (
    r"(/(?:hot|new|active|top|controversial|most_commented))?"
    + QUERY_RE
)
|
||
|
||
class PostmillPostExtractor(PostmillExtractor):
    """Extractor for a single submission URL"""
    subcategory = "post"
    example = "https://raddle.me/f/FORUM/123/TITLE"
    pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        # groups 3 and 4 hold the forum name and the numeric post ID
        self.forum, self.post_id = match.group(3, 4)

    def post_urls(self):
        """Return a 1-tuple with the URL of the requested submission"""
        return ("{}/f/{}/{}".format(self.root, self.forum, self.post_id),)
|
||
|
||
class PostmillShortURLExtractor(PostmillExtractor):
    """Extractor for short submission URLs"""
    subcategory = "shorturl"
    example = "https://raddle.me/123"
    pattern = BASE_PATTERN + r"/(\d+)$"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.post_id = match.group(3)

    def items(self):
        """Queue the URL that the short URL's "Location" header points to"""
        short_url = "{}/{}".format(self.root, self.post_id)
        # HEAD request with redirects disabled, so the "Location" header
        # of this very response can be read
        response = self.request(
            short_url, method="HEAD", allow_redirects=False)
        location = response.headers["Location"]
        yield (Message.Queue,
               text.urljoin(short_url, location),
               {"_extractor": PostmillPostExtractor})
|
||
|
||
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
    """Extractor for the home page"""
    subcategory = "home"
    example = "https://raddle.me/"
    # listing path ("/", "/featured", "/subscribed" or "/all"),
    # followed by the optional sorting path and query string
    pattern = (
        BASE_PATTERN
        + r"(/(?:featured|subscribed|all)?)"
        + SORTING_RE
    )
|
||
|
||
class PostmillForumExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum"""
    subcategory = "forum"
    example = "https://raddle.me/f/FORUM"
    # "/f/<forum>" listing path, followed by the optional sorting path
    # and query string
    pattern = (
        BASE_PATTERN
        + r"(/f/\w+)"
        + SORTING_RE
    )
|
||
|
||
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions made by a user"""
    subcategory = "usersubmissions"
    example = "https://raddle.me/user/USER/submissions"
    # the empty group "()" takes the place of the sorting path group
    # used by the other listing patterns
    pattern = (
        BASE_PATTERN
        + r"(/user/\w+/submissions)()"
        + QUERY_RE
    )
|
||
|
||
class PostmillTagExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum with a specific tag"""
    subcategory = "tag"
    example = "https://raddle.me/tag/TAG"
    # "/tag/<tag>" listing path, followed by the optional sorting path
    # and query string
    pattern = (
        BASE_PATTERN
        + r"(/tag/\w+)"
        + SORTING_RE
    )
|
||
|
||
class PostmillSearchExtractor(PostmillSubmissionsExtractor):
    """Extractor for search results"""
    subcategory = "search"
    example = "https://raddle.me/search?q=QUERY"
    # "q" is accepted in addition to the base class' query parameters
    whitelisted_parameters = ("q",)
    # the empty group "()" takes the place of the sorting path group;
    # the query string is mandatory here and has to start with "q="
    pattern = (
        BASE_PATTERN
        + r"(/search)()\?(q=[^#]+)$"
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 2 as | ||
# published by the Free Software Foundation. | ||
|
||
from gallery_dl.extractor import postmill | ||
|
||
|
||
# Expected results for the Postmill extractors.
# Keys starting with "#" are directives for the test runner; every
# "#pattern" value is a regular expression and is written as a raw string.
__tests__ = (
{
    "#url"     : "https://raddle.me/",
    "#category": ("postmill", "raddle.me", "home"),
    "#class"   : postmill.PostmillHomeExtractor,
    "#range"   : "1-25",
    "#count"   : 25,
},

{
    "#url"     : "https://raddle.me/f/traa",
    "#category": ("postmill", "raddle.me", "forum"),
    "#class"   : postmill.PostmillForumExtractor,
    "#count"   : 1,
    "#pattern" : r"^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$",
},

{
    "#url"     : "https://raddle.me/user/Sam_the_enby/submissions",
    "#category": ("postmill", "raddle.me", "usersubmissions"),
    "#class"   : postmill.PostmillUserSubmissionsExtractor,
    "#range"   : "1-25",
    "#count"   : 25,
},

{
    "#url"     : "https://raddle.me/tag/Trans",
    "#category": ("postmill", "raddle.me", "tag"),
    "#class"   : postmill.PostmillTagExtractor,
},

{
    "#url"     : "https://raddle.me/search?q=tw",
    "#category": ("postmill", "raddle.me", "search"),
    "#class"   : postmill.PostmillSearchExtractor,
    "#range"   : "1-50",
    "#count"   : 50,
},

{
    "#url"     : "https://raddle.me/160845",
    "#category": ("postmill", "raddle.me", "shorturl"),
    "#class"   : postmill.PostmillShortURLExtractor,
    "#pattern" : r"^https://raddle\.me/f/egg_irl/160845/egg_irl$",
},

{
    "#url"     : "https://raddle.me/f/NonBinary/179017/scattered-thoughts-would-appreciate-advice-immensely-tw",
    "#comment" : "Text post",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "#sha1_url"    : "99277f815820810d9d7e219d455f818601858378",
    "#sha1_content": "7a1159e1e45f2ce8e2c8b5959f6d66b042776f3b",
    "#count"   : 1,
},

{
    "#url"     : "https://raddle.me/f/egg_irl/160845",
    "#comment" : "Image post",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "#sha1_url"    : "48663f767ea258fcd545ab5aa0e734f98f434388",
    "#sha1_content": "431e938082c2b59c44888a83cfc711cd1f0e910a",
    "#count"   : 1,
},

{
    "#url"     : "https://raddle.me/f/trans/177042/tw-vent-nsfw-suicide-i-lost-no-nut-november-tw-trauma",
    "#comment" : "Image + text post (with text enabled)",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "#options" : {"save-link-post-body": True},
    "#pattern" : r"^(text:[\s\S]+|https://raddle\.me/submission_images/[0-9a-f]+\.png)$",
    "#count"   : 2,
},

{
    "#url"     : "https://raddle.me/f/videos/179541/raisins-and-sprite",
    "#comment" : "Link post",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "#urls"    : "https://m.youtube.com/watch?v=RFJCA5zcZxI",
    "#count"   : 1,
},

{
    "#url"     : "https://raddle.me/f/Anime/150698/neo-tokyo-1987-link-to-the-english-dub-version-last-link",
    "#comment" : "Link + text post (with text disabled)",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "#pattern" : r"^https://fantasyanime\.com/anime/neo-tokyo-dub$",
    "#count"   : 1,
},

{
    "#url"     : "https://raddle.me/f/egg_irl/166855/4th-wall-breaking-please-let-this-be-a-flair-egg-irl",
    "#comment" : "Post with multiple flairs",
    "#category": ("postmill", "raddle.me", "post"),
    "#class"   : postmill.PostmillPostExtractor,
    "flair"    : ["Gender non-specific", "4th wall breaking"],
},

)