Skip to content

Commit

Permalink
Merge pull request #481 from DataDog/bryce.thuilot/sint-2270
Browse files Browse the repository at this point in the history
feat: add typosquatting analyzer for go modules
  • Loading branch information
bthuilot authored Nov 13, 2024
2 parents 1517038 + 3453290 commit dd87387
Show file tree
Hide file tree
Showing 8 changed files with 3,111 additions and 37 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ Source code heuristics:
| **Heuristic** | **Description** |
|:-------------:|:---------------:|
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
| typosquatting | Identify packages whose names closely resemble that of a highly popular package |

<!-- END_RULE_LIST -->

## Custom Rules
Expand Down
7 changes: 6 additions & 1 deletion guarddog/analyzer/metadata/go/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from typing import Type

from guarddog.analyzer.metadata import Detector
from guarddog.analyzer.metadata.go.typosquatting import GoTyposquatDetector

GO_METADATA_RULES = {}

classes: list[Detector] = []
classes: list[Type[Detector]] = [
GoTyposquatDetector,
]

for detectorClass in classes:
detectorInstance = detectorClass() # type: ignore
Expand Down
118 changes: 118 additions & 0 deletions guarddog/analyzer/metadata/go/typosquatting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import json
import os
from typing import Optional

from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION


class GoTyposquatDetector(TyposquatDetector):
    """Detector for typosquatting attacks for go modules.

    Checks for distance one Levenshtein, one-off character swaps,
    permutations around hyphens, and substrings.

    Attributes:
        popular_packages (set): set of top 500 most popular Go packages,
            as determined by count of references across top starred repositories
    """

    def _get_top_packages(self) -> set:
        """
        Loads the popular Go packages from ``top_go_packages.json``

        The file is looked up in ``TOP_PACKAGES_CACHE_LOCATION`` when that is
        set, otherwise in ``../resources`` relative to this module's directory.

        Returns:
            set: the entries of the JSON document

        Raises:
            Exception: if the file is not listed in the directory, or if it
                decodes to JSON ``null``
            FileNotFoundError: if the directory itself does not exist
                (raised by ``os.listdir``)
        """

        top_packages_filename = "top_go_packages.json"

        resources_dir = TOP_PACKAGES_CACHE_LOCATION
        if resources_dir is None:
            resources_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "..", "resources")
            )

        top_packages_path = os.path.join(resources_dir, top_packages_filename)

        top_packages_information = None

        if top_packages_filename in os.listdir(resources_dir):
            with open(top_packages_path, "r") as top_packages_file:
                top_packages_information = json.load(top_packages_file)

        if top_packages_information is None:
            raise Exception(
                f"Could not retrieve top Go packages from {top_packages_path}")

        return set(top_packages_information)

    def detect(
        self,
        package_info,
        path: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[str] = None,
    ) -> tuple[bool, Optional[str]]:
        """
        Uses a Go package's name to determine whether the
        package is attempting a typosquatting attack

        Args:
            package_info: not used by this detector
            path (str): not used by this detector
            name (str): The name of the package,
                also known as the import path
            version (str): not used by this detector

        Returns:
            Tuple[bool, Optional[str]]: True if package is typosquatted,
                along with a message indicating the similar package name.
                False if not typosquatted and None
        """

        similar_package_names = self.get_typosquatted_package(name)
        if len(similar_package_names) > 0:
            return True, TyposquatDetector.MESSAGE_TEMPLATE % ", ".join(
                similar_package_names
            )
        return False, None

    def _get_confused_forms(self, package_name) -> list:
        """
        Gets confused terms for Go packages

        Confused terms are:
            - golang to go swaps (or vice versa)
            - the removal of go/golang terms
            - gitlab.com to github.com swaps (or vice versa)

        Args:
            package_name (str): name of the package

        Returns:
            list: list of confused terms; never contains an empty string
        """

        confused_forms = []

        # Hosting-provider swap; only the leading host prefix is replaced.
        if package_name.startswith("github.com/"):
            confused_forms.append(
                package_name.replace("github.com/", "gitlab.com/", 1)
            )
        elif package_name.startswith("gitlab.com/"):
            confused_forms.append(
                package_name.replace("gitlab.com/", "github.com/", 1)
            )

        terms = package_name.split("-")

        # Detect swaps like golang-package -> go-package.
        # NOTE(review): matching is by substring, so every "go" inside a term
        # is rewritten (e.g. a term containing "google") - confirm intended.
        for i, term in enumerate(terms):
            if "golang" in term:
                confused_term = term.replace("golang", "go")
            elif "go" in term:
                confused_term = term.replace("go", "golang")
            else:
                continue

            before, after = terms[:i], terms[i + 1:]

            # Form with the go/golang term swapped
            confused_forms.append("-".join(before + [confused_term] + after))

            # Form with the go/golang term removed. When that term was the
            # whole name nothing is left; an empty string is not a package
            # name, so it is not reported as a confused form.
            removed_form = "-".join(before + after)
            if removed_form:
                confused_forms.append(removed_form)

        return confused_forms


if __name__ == "__main__":
    # Running this module directly loads the top Go packages list
    # (top_go_packages.json) once via the detector.
    go_detector = GoTyposquatDetector()
    go_detector._get_top_packages()
6 changes: 6 additions & 0 deletions guarddog/analyzer/metadata/npm/typosquatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ def detect(
)
return False, None

def _get_confused_forms(self, package_name) -> list:
""" Gets confused terms for npm packages.
Currently, there are no confused terms for npm packages.
"""
return []


if __name__ == "__main__":
# update top_npm_packages.json
Expand Down
38 changes: 38 additions & 0 deletions guarddog/analyzer/metadata/pypi/typosquatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,44 @@ def detect(self, package_info, path: Optional[str] = None, name: Optional[str] =
return True, TyposquatDetector.MESSAGE_TEMPLATE % ", ".join(similar_package_names)
return False, None

def _get_confused_forms(self, package_name) -> list:
"""
Gets confused terms for python packages
Confused terms are:
- py to python swaps (or vice versa)
- the removal of py/python terms
Args:
package_name (str): name of the package
Returns:
list: list of confused terms
"""

confused_forms = []

terms = package_name.split("-")

# Detect swaps like python-package -> py-package
for i in range(len(terms)):
confused_term = None

if "python" in terms[i]:
confused_term = terms[i].replace("python", "py")
elif "py" in terms[i]:
confused_term = terms[i].replace("py", "python")
else:
continue

# Get form when replacing or removing py/python term
replaced_form = terms[:i] + [confused_term] + terms[i + 1:]
removed_form = terms[:i] + terms[i + 1:]

for form in (replaced_form, removed_form):
confused_forms.append("-".join(form))

return confused_forms


if __name__ == "__main__":
# update top_pypi_packages.json
Expand Down
Loading

0 comments on commit dd87387

Please sign in to comment.