-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmod.ts
132 lines (120 loc) · 3.33 KB
/
mod.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import puppeteer, { Page } from "https://deno.land/x/[email protected]/mod.ts";
import { EventEmitter } from "https://deno.land/x/[email protected]/mod.ts";
import { extractLinks } from "./lib/extractions.ts";
/**
 * Base shape shared by the extractor configurations: a predicate that decides,
 * from a page URL, whether the extractor applies to that page.
 */
interface Extractor {
/** Called with a page URL; returns `true` when this extractor should run for it. */
shouldExtract: (pageUrl: string) => boolean;
}
/**
 * Extractor configuration used by `Crawler` to collect further URLs from a page.
 */
export interface LinkExtractor extends Extractor {
/**
 * Passed as the `expression` option to `extractLinks`.
 * NOTE(review): the expression syntax is defined by `./lib/extractions.ts`,
 * which is not visible here — confirm the expected format there.
 */
linkExtraction: string;
}
/**
 * Extractor configuration describing which details `Crawler` reads from a
 * page once `shouldExtract` has accepted that page's URL.
 */
export interface DetailExtractor extends Extractor {
  /**
   * Maps an output field name to the selector string that `Crawler` hands to
   * `page.$`; the matched element's `innerText` becomes the field's value.
   */
  details: Record<string, string>;
}
/**
 * Everything a `Crawler` needs: what to crawl (`crawl`) and how to run the
 * browser that does the crawling (`crawler`).
 */
export interface Config {
  /** What to visit and what to pull out of each visited page. */
  crawl: {
    /** First URL the crawler navigates to. */
    startUrl: string;
    /** Extractors consulted on every visited page to discover further URLs. */
    linkExtractors: Array<LinkExtractor>;
    /** Extractor describing the fields reported through the "data" event. */
    detailExtractor: DetailExtractor;
  };
  /** Runtime settings for the crawler itself. */
  crawler: {
    /** Options forwarded as-is to `puppeteer.launch`. */
    launchConfig: { headless: boolean };
    /**
     * Delay in milliseconds the crawler waits before moving on to the next
     * queued URL. A default of 1000 applies when omitted.
     */
    politeness?: number;
  };
}
/**
 * Event name -> listener argument tuple for everything `Crawler` emits,
 * listed in the order the events occur during a crawl.
 */
type Events = {
  /** Crawl is starting: the start URL and the time of the call. */
  start: [startUrl: string, startedAt: Date];
  /** A page was visited and processed without throwing. */
  crawled: [url: string];
  /** Details extracted from a page, keyed by configured field name. */
  data: [details: { [field: string]: string }];
  /** Informational notice: a short message and the URL it concerns. */
  info: [message: string, url: string];
  /** A failure, stringified. */
  error: [message: string];
  /** Crawl is over: every URL visited and the time of completion. */
  finish: [crawledUrls: Array<string>, finishedAt: Date];
};
/**
 * Crawls pages with a single Puppeteer page, starting at
 * `config.crawl.startUrl` and following the URLs found by the configured
 * link extractors.
 *
 * Emitted events:
 * - `start`   (startUrl, Date)  when `crawl()` is called
 * - `info`    ("Duplicate url", url) when a discovered URL is already known
 * - `data`    (record) with the details extracted from a page
 * - `crawled` (url) after a page was processed without throwing
 * - `error`   (message) when visiting or extracting from a page throws
 * - `finish`  (visited URLs, Date) after the browser has been closed
 */
export class Crawler extends EventEmitter<Events> {
  readonly #crawl: Config["crawl"];
  readonly #crawler: Config["crawler"];
  /** URLs waiting to be visited; consumed from the end (last in, first out). */
  readonly #toCrawl: string[] = [];
  /** URLs visited so far, in visit order; reported with the "finish" event. */
  readonly #crawled: string[] = [];
  /** Every URL that has been queued or visited, for constant-time duplicate checks. */
  readonly #seen = new Set<string>();

  /**
   * @param config Crawl targets/extractors and browser settings.
   */
  constructor(config: Config) {
    super();
    this.#crawl = config.crawl;
    this.#crawler = config.crawler;
  }

  /**
   * Queues each URL that is neither visited nor already queued; for any other
   * URL an "info" event ("Duplicate url") is emitted instead.
   */
  #addUrls(urls: readonly string[]): void {
    for (const url of urls) {
      if (this.#seen.has(url)) {
        this.emit("info", "Duplicate url", url);
      } else {
        // Record the URL at queue time so a link that appears several times
        // before being visited is only crawled once.
        this.#seen.add(url);
        this.#toCrawl.push(url);
      }
    }
  }

  /**
   * Runs every link extractor whose `shouldExtract` accepts the current page
   * URL and queues the URLs they produce.
   */
  async #extractLinks(page: Page): Promise<void> {
    const pageUrl = page.url();
    const hrefLists = await Promise.all(
      this.#crawl.linkExtractors
        .filter((extractor) => extractor.shouldExtract(pageUrl))
        .map(async (extractor): Promise<string[]> => {
          // `extractLinks` lives in another module; validate its result at
          // runtime instead of asserting a type we cannot see from here.
          const found: unknown = await extractLinks({
            page,
            expression: extractor.linkExtraction,
          });
          return Array.isArray(found)
            ? found.filter((href): href is string => typeof href === "string")
            : [];
        }),
    );
    this.#addUrls(hrefLists.flat());
  }

  /**
   * When the detail extractor accepts the current page URL, reads the
   * `innerText` of the element found for each configured selector and emits
   * the results as one "data" event.
   */
  async #extractDetails(page: Page): Promise<void> {
    const { detailExtractor } = this.#crawl;
    if (!detailExtractor.shouldExtract(page.url())) {
      return;
    }
    const extraction = await Promise.all(
      Object.entries(detailExtractor.details).map(async ([key, selector]) => {
        const element = await page.$(selector);
        // `detail` stays undefined when `page.$` yields no handle (presumably
        // because the selector matched nothing), so that key is emitted with
        // an undefined value.
        const detail = await element?.evaluate((div) => div.innerText);
        return [key, detail];
      }),
    );
    this.emit("data", Object.fromEntries(extraction));
  }

  /**
   * Visits one URL: navigates, extracts links and details, then emits
   * "crawled". Any thrown error is reported as an "error" event rather than
   * propagated, so one failing page does not stop the crawl.
   */
  async #visit(page: Page, url: string): Promise<void> {
    try {
      this.#seen.add(url);
      // Recorded before navigating, so failed visits are listed as well.
      this.#crawled.push(url);
      await page.goto(url);
      await this.#extractLinks(page);
      await this.#extractDetails(page);
      this.emit("crawled", url);
    } catch (error) {
      // The caught value is not guaranteed to be an Error; String() yields the
      // same text as toString() for errors and also copes with null/undefined.
      this.emit("error", String(error));
    }
  }

  /** Waits for the configured politeness delay (1000 ms when not configured). */
  #pause(): Promise<void> {
    // `??` rather than `||` so an explicit 0 means "no delay".
    const delayMs = this.#crawler.politeness ?? 1000;
    return new Promise<void>((resolve) => setTimeout(resolve, delayMs));
  }

  /**
   * Visits `url`, then keeps taking URLs off the end of the queue until it is
   * empty, pausing between visits.
   *
   * Implemented as a loop so long crawls do not build up a chain of pending
   * recursive calls.
   */
  async #doCrawl(page: Page, url: string): Promise<void> {
    let current: string | undefined = url;
    while (current !== undefined) {
      await this.#visit(page, current);
      // Compare against undefined rather than truthiness so an empty-string
      // entry cannot end the crawl while URLs are still queued.
      current = this.#toCrawl.pop();
      if (current !== undefined) {
        await this.#pause();
      }
    }
  }

  /**
   * Launches a browser, crawls from the configured start URL until no queued
   * URLs remain, closes the browser and emits "finish".
   *
   * The browser is closed even if opening the page or crawling rejects; in
   * that case the rejection propagates and "finish" is not emitted.
   */
  async crawl(): Promise<void> {
    this.emit("start", this.#crawl.startUrl, new Date());
    const browser = await puppeteer.launch(this.#crawler.launchConfig);
    try {
      const page = await browser.newPage();
      await this.#doCrawl(page, this.#crawl.startUrl);
    } finally {
      await browser.close();
    }
    this.emit("finish", this.#crawled, new Date());
  }
}