-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlinkCheck.ts
185 lines (153 loc) · 5.24 KB
/
linkCheck.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import { DOMParser, parseHTML } from "linkedom";
// mozilla domain parsing library
import * as psl from "psl";
/** Display text and target URL taken from a single `<a>` tag. */
interface linkData {
/** Text content of the anchor; extractLinkData trims it and strips all space characters. */
displayText: string;
/** Value read from the anchor's `href` attribute. */
url: string;
}
/**
 * Parses an HTML snippet with linkedom and returns the first `<a>` element
 * found in the resulting document.
 *
 * NOTE(review): indexing with `[0]` yields `undefined` when the snippet
 * contains no `<a>` element, despite the declared return type.
 *
 * @param atag - HTML source expected to contain an anchor tag.
 * @returns The first anchor element of the parsed document.
 */
function parseLinkTag(atag: string): HTMLElement {
  const parsed: HTMLDocument = parseHTML(atag).document;
  const anchors = parsed.getElementsByTagName("a");
  const firstAnchor: HTMLElement = anchors[0];
  return firstAnchor;
}
/**
 * Extracts the displayed text and the target URL from a single `<a>` tag.
 *
 * @param atag - HTML source containing the anchor element.
 * @returns The anchor's text content (trimmed, with every space character
 *   removed) together with the raw value of its `href` attribute.
 * @throws Error with message "formatError" when the input contains no `<a>`
 *   element, or when the element has no `href` attribute or an empty one.
 */
export function extractLinkData(atag: string): linkData {
  const htmlEl = parseLinkTag(atag.trim());
  if (!htmlEl) {
    // Without this guard the property access below would fail with an
    // unrelated TypeError instead of the documented format error.
    console.error("no a tag found in input");
    throw new Error("formatError");
  }

  // Inner content (display text); spaces are removed so that text such as
  // "google . com" is compared in its collapsed form.
  const displayText = (htmlEl.textContent ?? "").trim().replace(/ /g, "");

  // getAttribute() reports a missing attribute by returning null, it does not
  // throw, so the absence has to be checked explicitly.
  const href = htmlEl.getAttribute("href");
  if (!href) {
    console.error("a tag has no href attribute");
    throw new Error("formatError");
  }

  return { displayText, url: href };
}
/**
 * Reduces a URL to the `domain` value reported by `psl.parse` for its
 * hostname, i.e. without protocol or path.
 *
 * @param url - Absolute URL string; it must be parseable by `new URL()`.
 * @returns The domain reported by psl for the URL's hostname.
 * @throws Error when the string is not a parseable URL, or when psl yields no
 *   domain for the hostname.
 */
function extractDomainName(url: string): string {
  let host: string;
  try {
    host = new URL(url).hostname;
  } catch (err) {
    console.info("string can't be parsed to url", err);
    throw new Error("string can't be parsed to url: " + err);
  }

  const domainName = psl.parse(host).domain;
  if (domainName) {
    return domainName;
  }

  console.error("domain can't be parsed");
  throw new Error("domain can't be parsed");
}
/**
 * Checks whether the input loosely looks like a link.
 *
 * Used to decide whether the text displayed inside `<a> ... </a>` resembles a
 * URL. The check is deliberately loose: phishers can introduce small errors
 * that a user does not notice but that would slip past a strict parser.
 *
 * All whitespace is removed before matching, so text such as
 * "g o o g l e . c o m" is still recognised. The pattern only accepts a
 * two- to four-letter final label after the dot.
 *
 * @param string - Text to inspect.
 * @returns `true` when some part of the whitespace-stripped text matches the
 *   loose URL pattern, otherwise `false`.
 */
function isURLlike(string: string): boolean {
  // regex from https://regexr.com/2rj36
  // eslint-disable-next-line
  const looseURLCheck =
    /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/i;

  // A string pattern in replace() only removes the first occurrence; a global
  // whitespace regex removes every space, tab and line break.
  const cleaned = string.replace(/\s+/g, "");

  return looseURLCheck.test(cleaned);
}
/**
 * Makes sure a displayed link carries a protocol so that `new URL()` can
 * parse it.
 *
 * Display texts such as "www.google.com" or "google.com" have no protocol;
 * those get a generic "http://" prefix. Input that already starts with
 * word characters followed by a colon is returned unchanged.
 *
 * @param displayTextURL - Link-like text taken from an anchor's display text.
 * @returns The input, prefixed with "http://" when no protocol was detected.
 */
function normalizeDisplayTextURL(displayTextURL: string): string {
  const protocolPrefix = /^(?<proto>\w+):/g;
  const hasProtocol = displayTextURL.match(protocolPrefix) !== null;
  return hasProtocol ? displayTextURL : "http://" + displayTextURL;
}
/**
 * Derives a domain name from an anchor's display text, if that text looks
 * like a link.
 *
 * @param displayText - Text shown to the user inside the anchor.
 * @returns The domain extracted from the display text, or `null` when the
 *   text does not look like a link or cannot be parsed into a domain.
 */
function getDomainNamefromDisplayedLink(displayText: string): string | null {
  // Plain text that does not resemble a link has no domain to compare.
  if (!isURLlike(displayText)) {
    return null;
  }

  try {
    // A protocol is added first where missing, because URL parsing needs one.
    return extractDomainName(normalizeDisplayTextURL(displayText));
  } catch (err) {
    // Not a valid URL after all, so there is nothing to compare against.
    console.info("couldn't parse displayText into valid url", err);
    return null;
  }
}
/**
 * Checks whether a link is misleading, i.e. its display text looks like a URL
 * whose domain differs from the domain the link actually points to.
 *
 * @param linkData - Display text and target URL of the anchor to check.
 * @returns `true` when the display text contains a link-like value whose
 *   domain differs from the target's domain; `false` when the domains match
 *   or the display text does not contain a link.
 * @throws Error with message "no url provided" when `linkData.url` is empty,
 *   and whatever extractDomainName throws when the target URL cannot be
 *   reduced to a domain.
 */
export function isFakeLink(linkData: linkData): boolean {
  if (!linkData.url) {
    console.error("no url present to check");
    throw new Error("no url provided");
  }

  const linkURLdomainName = extractDomainName(linkData.url);
  const displayTextURL = getDomainNamefromDisplayedLink(linkData.displayText);

  if (!displayTextURL) {
    console.info("displayTextURL doesn't contain a link");
    return false;
  }

  // Both values are plain strings here, so one strict comparison covers the
  // "same domain" and "different domain" outcomes on every code path.
  return displayTextURL !== linkURLdomainName;
}