-
Notifications
You must be signed in to change notification settings - Fork 322
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
744 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
/** | ||
* 批量下載 Apple Podcasts 節目的工具。 Download Apple Podcasts episodes. | ||
*/ | ||
|
||
'use strict'; | ||
|
||
require('../work_crawler_loader.js'); | ||
|
||
// ---------------------------------------------------------------------------- | ||
|
||
CeL.run([ 'application.storage.EPUB' | ||
// CeL.detect_HTML_language() | ||
, 'application.locale' ]); | ||
|
||
// ---------------------------------------------------------------------------- | ||
|
||
/**
 * Crawler configuration for Apple Podcasts.
 *
 * Each show is treated as a "work" and each episode as a "chapter": the media
 * file of every episode is saved under `<work directory>/media/`, and the
 * episode description is added to the generated e-book.
 */
var crawler = new CeL.work_crawler({
    // Automatically create an e-book.
    // CeL.application.locale MUST be loaded for this to work.
    need_create_ebook : true,
    // recheck: re-examine every chapter and image of every work from the
    // start, without re-fetching images. default: false
    // recheck='changed': when the work has changed (e.g. new chapters),
    // re-download / re-check every chapter; otherwise continue after the last
    // downloaded chapter.
    // recheck : 'changed',

    // search_work_interval : '2s',
    // chapter_time_interval : '2s',

    site_name : 'Podcast',

    base_URL : 'https://podcasts.apple.com/',

    // ------------------------------------------------------------------
    // Resolve work title → work id. get_work()

    /**
     * Build the URL of the search page for a keyword.
     *
     * @param {String}key
     *            search keyword (show title).
     * @returns {String} absolute URL of the search page.
     */
    search_URL : function(key) {
        // NOTE(review): key is inserted verbatim; confirm that the caller
        // percent-encodes it when needed.
        return 'https://www.apple.com/tw/search/' + key + '?src=globalnav';
    },

    /**
     * Extract the podcast entries from the search result page.
     *
     * @param {String}html
     *            search result page.
     * @param {Function}get_label
     *            converts an HTML fragment to plain text.
     * @returns {Array} [ id_list, id_data ]: id_list holds
     *          "<decoded slug>-<numeric id>" strings, id_data the matching
     *          titles at the same indexes.
     */
    parse_search_result : function(html, get_label) {
        // Only the part of the page after the "explore" anchor is scanned.
        html = html.between(' id="explore"');

        var id_list = [], id_data = [];

        html.each_between('<div class="rf-serp-product-description">', null,
        //
        function(text) {
            var matched = text.match(/\/podcast\/([^\/"]+)\/id(\d+)"/);
            if (!matched) {
                // Result block without a /podcast/<slug>/id<digits> link.
                return;
            }
            id_list.push(decodeURIComponent(matched[1]) + '-' + matched[2]);
            id_data.push(get_label(text.between(
                    '<h2 class="rf-serp-productname">', '</h2>')));
        });

        return [ id_list, id_data ];
    },

    // ------------------------------------------------------------------
    // Fetch the data of the work. get_work_data()

    /**
     * Convert a work id made by parse_search_result() back to the path of the
     * show page (relative to base_URL).
     *
     * @param {String}work_id
     *            "<slug>-<numeric id>".
     * @returns {String} relative URL of the show page.
     * @throws {Error} when work_id does not end with "-<digits>".
     */
    work_URL : function(work_id) {
        var matched = String(work_id).match(/^(.+)-(\d+)$/);
        if (!matched) {
            // Fail with a readable message instead of a TypeError on null.
            throw new Error('work_URL: Invalid work id: '
                    + JSON.stringify(work_id));
        }
        return 'tw/podcast/' + matched[1] + '/id' + matched[2];
    },

    /**
     * Build work_data from the show page.
     *
     * @param {String}html
     *            show page.
     * @param {Function}get_label
     *            converts an HTML fragment to plain text.
     * @param {Function}extract_work_data
     *            meta-data extractor supplied by the framework (unused).
     * @returns {Object} the title merged with every property of the page's
     *          "schema:podcast-show" JSON-LD block.
     */
    parse_work_data : function(html, get_label, extract_work_data) {
        var work_data = {
            // Required property; depends on the page layout of the site.
            title : get_label(html.between(
                    '<span class="product-header__title"', '</span>').between(
                    '>'))
        };

        Object.assign(work_data, JSON.parse(html.between(
        //
        '<script name="schema:podcast-show" type="application/ld+json">',
                '</script>')));

        // Strip invisible bidirectional control characters from the title.
        // e.g.,
        // https://podcasts.apple.com/tw/podcast/%E4%B8%8B%E4%B8%80%E6%9C%AC%E8%AE%80%E4%BB%80%E9%BA%BC/id1532820533
        // The characters are written as escapes so that they stay visible
        // and survive copy & paste.
        work_data.title = work_data.title.replace(
                /[\u200E\u200F\u202A-\u202E\u2066-\u2069]/g, '');

        return work_data;
    },

    /**
     * Fill work_data.chapter_list with the episodes found in the JSON cache
     * embedded in the show page.
     *
     * @param {Object}work_data
     *            work data; .chapter_list is replaced.
     * @param {String}html
     *            show page.
     * @param {Function}get_label
     *            converts an HTML fragment to plain text (unused).
     */
    get_chapter_list : function(work_data, html, get_label) {
        var cache = JSON.parse(html.between(
                ' id="shoebox-media-api-cache-amp-podcasts">', '</script>'));
        // The value of the first key is itself a JSON string.
        var show = JSON.parse(cache[Object.keys(cache)[0]]).d[0];
        var episode_list = show.relationships.episodes.data;

        episode_list.forEach(function(chapter_data) {
            chapter_data.title = chapter_data.attributes.name;
            chapter_data.url = chapter_data.attributes.url;
        });

        // reset work_data.chapter_list
        work_data.chapter_list = episode_list;
    },

    /**
     * Download the media file of the episode before the chapter page is
     * parsed (async).
     *
     * This function must always end up calling callback() and must not throw:
     * every failure is logged and the chapter goes on without a media file.
     *
     * @param {Object}XMLHttp
     *            response of the chapter page (unused).
     * @param {Object}work_data
     *            work data with .directory and .chapter_list.
     * @param {Function}callback
     *            called without arguments when done.
     * @param {Number}chapter_NO
     *            chapter number, starting from 1.
     */
    pre_parse_chapter_data : function(XMLHttp, work_data, callback,
            chapter_NO) {
        // Matches a trailing ".ext", optionally followed by a query string.
        // "/" is excluded so that a host or path segment such as ".com/x"
        // is never taken as the extension.
        var PATTERN_extension = /(\.[^.?\/]+)(?:\?.*)?$/;
        var url, file_name;

        try {
            var chapter_data = work_data.chapter_list[chapter_NO - 1];
            var asset_URL = chapter_data.attributes.assetUrl;
            if (asset_URL) {
                var wrapped_URL = decodeURI(asset_URL);
                // Some media URLs are wrapped by a tracker that carries the
                // real address percent-encoded. decodeURI() keeps "%3A" and
                // "%2F", so the embedded address is still recognizable.
                var matched = wrapped_URL.match(/https%3A%2F%2F[^?]+/);
                url = matched ? decodeURIComponent(matched[0]) : wrapped_URL;

                var extension = url.match(PATTERN_extension)
                        || wrapped_URL.match(PATTERN_extension);
                var directory = work_data.directory + 'media'
                        + CeL.env.path_separator;
                file_name = directory + CeL.to_file_name(chapter_data.title)
                        + (extension ? extension[1] : '');
                CeL.create_directory(directory);
            } else {
                CeL.error('pre_parse_chapter_data: No media URL for chapter '
                        + chapter_NO + ' [' + chapter_data.title + ']');
            }
        } catch (e) {
            CeL.error('pre_parse_chapter_data: Cannot prepare the download'
                    + ' of chapter ' + chapter_NO + ': ' + e);
            url = undefined;
        }

        if (!url) {
            callback();
            return;
        }

        CeL.log_temporary('Fetching [' + file_name + '] (' + url + ')...');
        CeL.get_URL_cache(url, function(data, error) {
            if (error) {
                // Report the failure instead of dropping it silently.
                CeL.error('pre_parse_chapter_data: Failed to fetch [' + url
                        + ']: ' + error);
            }
            callback();
        }, {
            file_name : file_name,
            encoding : undefined,
            get_URL_options : Object.assign({
                error_retry : this.MAX_ERROR_RETRY
            }, this.get_URL_options, {
                // Some files are large and need more time.
                timeout : 5 * 60 * 1000
            })
        });
    },

    // ------------------------------------------------------------------
    // Process the content of each chapter. get_chapter_data()

    /**
     * Add the description of the episode to the e-book.
     *
     * @param {String}html
     *            chapter page.
     * @param {Object}work_data
     *            work data with .chapter_list.
     * @param {Function}get_label
     *            converts an HTML fragment to plain text (unused).
     * @param {Number}chapter_NO
     *            chapter number, starting from 1.
     */
    parse_chapter_data : function(html, work_data, get_label, chapter_NO) {
        var chapter_data = work_data.chapter_list[chapter_NO - 1];
        // Episodes without a description get an empty text.
        var description = chapter_data.attributes.description;

        this.add_ebook_chapter(work_data, chapter_NO, {
            title : chapter_data.title,
            // NOTE(review): the date is read from a "post-byline" element of
            // the chapter page; confirm that the page really has one.
            date : html.between('<p class="post-byline">', '<').trim()
                    .replace(/^\d*$/, ''),
            text : description && description.standard || ''
        });
    }
});
|
||
// ---------------------------------------------------------------------------- | ||
|
||
// CeL.set_debug(3); | ||
|
||
start_crawler(crawler, typeof module === 'object' && module); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/** | ||
* 批量下載 比奇中文网 小说 的工具。 Download biqizw novels. | ||
*/ | ||
|
||
'use strict'; | ||
|
||
require('../work_crawler_loader.js'); | ||
|
||
// ---------------------------------------------------------------------------- | ||
|
||
CeL.run('application.net.work_crawler.sites.PTCMS'); | ||
|
||
// ---------------------------------------------------------------------------- | ||
|
||
/**
 * Crawler configuration for biqizw (比奇中文网), built on the PTCMS site
 * template.
 */
var crawler = CeL.PTCMS({
    // Before 2024/3/9: https://www.biqizw.com/
    base_URL : 'https://www.biqizw.com/',
    charset : 'gbk',

    // chapter_time_interval : '6s',

    // Resolve work title → work id. get_work()
    search_URL : 'modules/article/search.php?searchkey=',
    parse_search_result : 'biquge',
    search_work_interval : '30s',

    chapter_time_interval : '1s',

    // Fetch the data of the work. get_work_data()
    /**
     * @param {Number|String}work_id
     *            numeric work id.
     * @returns {String} relative URL "<integer part of id/1000>_<id>/".
     */
    work_URL : function(work_id) {
        var group = work_id / 1000 | 0;
        return group + '_' + work_id + '/';
    },

    /**
     * Get the part of the page that holds the chapter list.
     *
     * @param {String}html
     *            work page.
     * @returns {String} text between '<div id="list">' and '</div>'.
     */
    get_chapter_list_contents : function(html) {
        return html.between('<div id="list">', '</div>');
    },

    /**
     * Strip the advertisements the site puts around and inside the chapter
     * text.
     *
     * @param {String}text
     *            chapter HTML.
     * @returns {String} chapter HTML without the known advertisements.
     */
    remove_ads : function remove_ads(text) {
        // Site banner at the beginning.
        // https://www.biqizw.com/3_3733/3167227.html 第39章“师兄,承让了!”
        // 比奇中文网 www.biqizw.com,最快更新长生:开局一条命,修为全靠苟 !<br><br>
        text = text.replace(/[^<>]+中文网\s*[\w.]+,最快更新[^<>]+/, '');

        // Trailing block that advertises a reader app, up to the site name.
        // https://www.biqizw.com/3_3733/3167227.html
        text = text.replace(
        //
        /无尽的昏迷过后,时宇猛地从床上起身。想要看最新章节内容,请下载星星阅读app,[\s\S]+比奇中文/, '');

        // First (malformed, nested) link back to the site, up to the next
        // <br> or the end of the text. The <br> itself is kept.
        // https://www.biqizw.com/3_3733/3167227.html
        // <br /><br /> <a href="<a href="http://www.biqizw.com"" ...>www.biqizw.com</a></a> 比奇中文
        text = text.replace(/<a href=[\s\S]+?(<br[^<>]*>|$)/, '$1');

        // Watermarks appended to the end of a paragraph.
        // https://www.biqizw.com/3_3733/3167190.html 第2章 宗门里的摸鱼日常
        // 学不到。Μ.<br /><br />
        // https://www.biqizw.com/3_3733/3167191.html 第3章 猪肉铺的姑娘
        // 说了话。【1】 【6】 【6】 【小】 【说】<br /><br />
        // https://www.biqizw.com/3_3733/3167192.html 第4章 不试一试,怎么知道不行呢?
        // 宗门的安危。”ωWW.<br /><br />
        // https://www.biqizw.com/3_3733/3167194.html 第6章 人生若只如初见
        // 熟悉的面容。大风小说<br /><br />
        // https://www.biqizw.com/3_3733/3167200.html 第12章 苟着也能惹祸上身?
        // 撂倒了……166小说<br /><br />
        // https://www.biqizw.com/3_3733/3167531.html 第341章 百年计划
        // 166小说 无尽的昏迷过后,时宇猛地从床上起身。
        text = text.replace(
        //
        /(?:ωWW\.|166小说|大风小说|(?<=\W)Μ\.|【1】 【6】 【6】 【小】 【说】)(<br[^<>]*>|\s*$)/g
        //
        , '$1');

        // Advertisements appended to a paragraph, up to the next <br>.
        // A single chapter holds several of them, so every occurrence is
        // removed (global flag).
        // https://www.biqizw.com/3_3733/3167227.html 第39章“师兄,承让了!”
        // 以压倒性的优势取得了胜利。水印广告测试 水印广告测试<br /><br />
        // 时宇猛地从床上起身。想要看最新章节内容,请下载星星阅读app,无广告免费阅读最新章节内容。网站已经不更新最新章节内容,已经星星阅读小说APP更新最新章节内容。<br /><br />
        // 这不是他!下载星星阅读app,阅读最新章节内容无广告免费<br /><br />
        text = text.replace(
        //
        /(?:水印广告测试|想要看最新章节内容|下载星星阅读)[^<>]*?(<br[^<>]*>)/g, '$1');

        return text;
    }
});
|
||
// ---------------------------------------------------------------------------- | ||
|
||
// CeL.set_debug(3); | ||
|
||
start_crawler(crawler, typeof module === 'object' && module); |
Oops, something went wrong.