forked from ejucovy/mercury-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcli.js
executable file
·103 lines (98 loc) · 2.44 KB
/
cli.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env node
/* eslint-disable */
const Mercury = require('./dist/mercury');
const argv = require('yargs-parser')(process.argv.slice(2));
// Positional argument: the URL to parse.
const [url] = argv._;
// Each option is read under both its long name and its one-letter alias;
// the caller below picks whichever of the pair is set.
const { format, f } = argv;
const { extend, e } = argv;
const { extendList, l } = argv;
const { header, h } = argv;
const { addExtractor, x } = argv;
/**
 * CLI entry point: parses a single URL with Mercury and prints the result as
 * pretty-printed JSON on stdout. With no URL, prints usage help and returns.
 * On any failure, prints a diagnostic to stderr and exits with status 1.
 *
 * @param {string} urlToParse - URL to parse.
 * @param {string} [contentType] - Output format key: html, markdown/md or
 *   text/txt. Unknown keys are passed to Mercury as `undefined`.
 * @param {string|string[]} [extendedTypes] - `type=selector` specs for custom
 *   fields. A selector of the form `a|b` is split into the pair `[a, b]`.
 * @param {string|string[]} [extendedListTypes] - Same as `extendedTypes`, but
 *   each field is registered with `allowMultiple: true`.
 * @param {Object} [headers] - Forwarded unchanged as the `headers` option
 *   (presumably built by the argument parser from `--header.name=value`).
 * @param {string} [addExtractor] - Module path of a custom extractor to load.
 */
(async (
  urlToParse,
  contentType,
  extendedTypes,
  extendedListTypes,
  headers,
  addExtractor
) => {
  if (!urlToParse) {
    // The continuation lines of this literal start at column 0 on purpose:
    // any leading whitespace would become part of the printed help text.
    console.log(
      '\n\
mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
\n\
'
    );
    return;
  }

  // Registers every `type=selector` spec in `specs` on `target`.
  const addExtensions = (target, specs, allowMultiple) => {
    for (const spec of [].concat(specs || [])) {
      // Split on the FIRST '=' only, so selectors that themselves contain
      // '=' (attribute selectors such as a[rel=author]) stay intact.
      const separatorAt = typeof spec === 'string' ? spec.indexOf('=') : -1;
      if (separatorAt === -1) {
        throw new Error(
          `Invalid extend argument "${spec}": expected type=selector`
        );
      }
      const name = spec.slice(0, separatorAt);
      const selector = spec.slice(separatorAt + 1);
      const fullSelector =
        selector.indexOf('|') > 0 ? selector.split('|') : selector;
      target[name] = allowMultiple
        ? { selectors: [fullSelector], allowMultiple: true }
        : { selectors: [fullSelector] };
    }
  };

  try {
    const contentTypeMap = {
      html: 'html',
      markdown: 'markdown',
      md: 'markdown',
      text: 'text',
      txt: 'text',
    };

    // List specs are applied second, so they win when a name is repeated.
    const extensions = {};
    addExtensions(extensions, extendedTypes, false);
    addExtensions(extensions, extendedListTypes, true);

    // Attempt to load custom extractor from path.
    // NOTE: require() resolves a relative path against this file's
    // directory, not the current working directory.
    let customExtractor;
    if (addExtractor) {
      customExtractor = require(addExtractor);
    }

    const result = await Mercury.parse(urlToParse, {
      contentType: contentTypeMap[contentType],
      extend: extensions,
      headers,
      customExtractor,
    });
    console.log(JSON.stringify(result, null, 2));
  } catch (err) {
    // Timeouts get a short dedicated message; everything else is dumped in
    // full so the failure can be diagnosed.
    const isTimeout =
      Boolean(err) &&
      (err.message === 'ETIMEDOUT' || err.code === 'ETIMEDOUT');
    if (isTimeout) {
      console.error(
        '\nMercury Parser encountered a timeout trying to load that resource.'
      );
    } else {
      console.error(
        '\nMercury Parser encountered a problem trying to parse that resource.\n'
      );
      console.error(err);
    }
    const reportBug =
      'If you believe this was an error, please file an issue at:\n\n    https://github.com/postlight/mercury-parser/issues/new';
    console.error(`\n${reportBug}\n`);
    process.exit(1);
  }
})(
  url,
  format || f,
  extend || e,
  extendList || l,
  header || h,
  addExtractor || x
);