Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Alpine parsing to actually extract commit instead of the checksum #241

Merged
merged 6 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 67 additions & 46 deletions extractor/filesystem/os/apk/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,8 @@ package apk
import (
"bufio"
"context"
"errors"
"fmt"
"io"
"io/fs"
"net/textproto"
"path/filepath"
"strings"

Expand Down Expand Up @@ -129,67 +126,91 @@ func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) ([]
return inventory, err
}

// parseSingleApkRecord reads from the scanner a single record,
// returns nil, nil when scanner ends.
func parseSingleApkRecord(scanner *bufio.Scanner) (map[string]string, error) {
// There is currently 26 keys defined here (Under "Installed Database V2"):
// https://wiki.alpinelinux.org/wiki/Apk_spec
var group map[string]string = make(map[string]string, 30)

for scanner.Scan() {
line := scanner.Text()

if line != "" {
key, val, found := strings.Cut(line, ":")

if !found {
return nil, fmt.Errorf("invalid line: %q", line)
}

group[key] = val
continue
}

// check both that line is empty and we have filled out data in group
// this avoids double empty lines returning early
if line == "" && len(group) > 0 {
return group, nil
}
}

// Err() could only be non nil when Scan() returns false
another-rex marked this conversation as resolved.
Show resolved Hide resolved
return group, scanner.Err()
}

func (e Extractor) extractFromInput(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Inventory, error) {
m, err := osrelease.GetOSRelease(input.FS)
if err != nil {
log.Errorf("osrelease.ParseOsRelease(): %v", err)
}

rd := textproto.NewReader(bufio.NewReader(input.Reader))
pkgs := []*extractor.Inventory{}
scanner := bufio.NewScanner(input.Reader)
inventories := make([]*extractor.Inventory, 0)
another-rex marked this conversation as resolved.
Show resolved Hide resolved

for eof := false; !eof; {
// Return if canceled or exceeding deadline.
if err := ctx.Err(); err != nil {
return pkgs, fmt.Errorf("%s halted at %q because of context error: %v", e.Name(), input.Path, err)
return nil, fmt.Errorf("%s halted at %q because of context error: %v", e.Name(), input.Path, err)
}

h, err := rd.ReadMIMEHeader()
record, err := parseSingleApkRecord(scanner)
if err != nil {
vpasdf marked this conversation as resolved.
Show resolved Hide resolved
if errors.Is(err, io.EOF) {
// We might still have one more line of data
// so return only after it's been parsed.
eof = true
} else {
log.Errorf("Failed to extract all APK packages: %v", err)
return pkgs, nil
}
return nil, fmt.Errorf("error while parsing apk status file %q: %w", input.Path, err)
}
pkgName := h.Get("P")
version := h.Get("V")
if pkgName == "" || version == "" {
if !eof { // Expected when reaching the last line.
log.Warnf("APK package name or version is empty (name: %q, version: %q)", pkgName, version)
}
continue
}
originName := h.Get("o")
maintainer := h.Get("m")
arch := h.Get("A")
license := h.Get("L")
commit := h.Get("c")
var sourceCode *extractor.SourceCodeIdentifier
if commit != "" {
sourceCode = &extractor.SourceCodeIdentifier{
Commit: commit,
}

if len(record) == 0 {
break
}
pkgs = append(pkgs, &extractor.Inventory{
Name: pkgName,
Version: version,

var pkg = &extractor.Inventory{
Name: record["P"],
Version: record["V"],
Metadata: &Metadata{
PackageName: pkgName,
OriginName: originName,
OSID: m["ID"],
OSVersionID: m["VERSION_ID"],
Maintainer: maintainer,
Architecture: arch,
License: license,
PackageName: record["P"],
OriginName: record["o"],
Architecture: record["A"],
License: record["L"],
Maintainer: record["m"],
},
SourceCode: sourceCode,
Locations: []string{input.Path},
})
Locations: []string{input.Path},
}

if commit, ok := record["c"]; ok {
another-rex marked this conversation as resolved.
Show resolved Hide resolved
pkg.SourceCode = &extractor.SourceCodeIdentifier{
Commit: commit,
}
}

if pkg.Name == "" || pkg.Version == "" {
vpasdf marked this conversation as resolved.
Show resolved Hide resolved
log.Warnf("APK package name or version is empty (name: %q, version: %q)", pkg.Name, pkg.Version)
continue
}

inventories = append(inventories, pkg)
}
return pkgs, nil

return inventories, nil
}

func toNamespace(m *Metadata) string {
Expand Down
41 changes: 21 additions & 20 deletions extractor/filesystem/os/apk/extractor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,21 @@ func TestExtract(t *testing.T) {
path: "testdata/installed",
osrelease: alpine,
wantInventory: []*extractor.Inventory{
getInventory("testdata/installed", "alpine-baselayout", "alpine-baselayout", "3.4.3-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1zwvKMnYs1b6ZdPTBJ0Z7D5P3jyA="),
getInventory("testdata/installed", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1YCAH7jdO2W816b85sUh9Z8av4Cc="),
getInventory("testdata/installed", "alpine-keys", "alpine-keys", "2.4-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "MIT", "Q17Do9XvTHoWjQlRYJe7MhnKd8FTQ="),
getInventory("testdata/installed", "apk-tools", "apk-tools", "2.14.0-r0", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1NpN9vv021bmYzHQp262233VQXes="),
getInventory("testdata/installed", "busybox", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "Q13YHCZdGFFJZvgXLCNpZqvnIg/PQ="),
getInventory("testdata/installed", "busybox-binsh", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "Q1uXVXJgqIa0rg2YFhdxJ/CSe4zas="),
getInventory("testdata/installed", "ca-certificates-bundle", "ca-certificates", "20230506-r0", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "MPL-2.0 AND MIT", "Q1R/SF0IZwqesh6/EOcK5l3EOrbD0="),
getInventory("testdata/installed", "libc-utils", "libc-dev", "0.7.2-r5", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "BSD-2-Clause AND BSD-3-Clause", "Q1Llna/ri8oHhlQIRsaG8SGug0ikI="),
getInventory("testdata/installed", "libcrypto3", "openssl", "3.1.0-r4", "alpine", "3.18.0", "Ariadne Conill <[email protected]>", "x86_64", "Apache-2.0", "Q1pU2jX0Nb9bzv0BLgQ/1FEelrSbg="),
getInventory("testdata/installed", "libssl3", "openssl", "3.1.0-r4", "alpine", "3.18.0", "Ariadne Conill <[email protected]>", "x86_64", "Apache-2.0", "Q1XjnaA5LrGpHjoyWOR16xY32oW38="),
getInventory("testdata/installed", "musl", "musl", "1.2.4-r0", "alpine", "3.18.0", "Timo Teräs <[email protected]>", "x86_64", "MIT", "Q153m2gOhVOa253ExsSOb33XXh32s="),
getInventory("testdata/installed", "musl-utils", "musl", "1.2.4-r0", "alpine", "3.18.0", "Timo Teräs <[email protected]>", "x86_64", "MIT AND BSD-2-Clause AND GPL-2.0-or-later", "Q16W+GrXf7HQw+k7JuY7ZAIjWrgYk="),
getInventory("testdata/installed", "scanelf", "pax-utils", "1.3.7-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q14nq9o4+uo2NaLbTVDQB3UeooC0M="),
getInventory("testdata/installed", "ssl_client", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "Q11TSZ9b3e+tcUKLyjh08V4vkWJYU="),
getInventory("testdata/installed", "zlib", "zlib", "1.2.13-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "Zlib", "Q1JlboSJkrN4qkDcokr4zenpcWEXQ="),
getInventory("testdata/installed", "alpine-baselayout", "alpine-baselayout", "3.4.3-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "65502ca9379dd29d1ac4b0bf0dcf03a3dd1b324a"),
vpasdf marked this conversation as resolved.
Show resolved Hide resolved
getInventory("testdata/installed", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "65502ca9379dd29d1ac4b0bf0dcf03a3dd1b324a"),
getInventory("testdata/installed", "alpine-keys", "alpine-keys", "2.4-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "MIT", "aab68f8c9ab434a46710de8e12fb3206e2930a59"),
getInventory("testdata/installed", "apk-tools", "apk-tools", "2.14.0-r0", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "028d34f678a5386c3dc488cc3b62467c7a9d1a0b"),
getInventory("testdata/installed", "busybox", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "b5c719c244319df3c72ab1f1ee994c2143cab7f0"),
getInventory("testdata/installed", "busybox-binsh", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "b5c719c244319df3c72ab1f1ee994c2143cab7f0"),
getInventory("testdata/installed", "ca-certificates-bundle", "ca-certificates", "20230506-r0", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "MPL-2.0 AND MIT", "59534a02716a92a10d177a118c34066162eff4a6"),
getInventory("testdata/installed", "libc-utils", "libc-dev", "0.7.2-r5", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "BSD-2-Clause AND BSD-3-Clause", "988f183cc9d6699930c3e18ccf4a9e36010afb56"),
getInventory("testdata/installed", "libcrypto3", "openssl", "3.1.0-r4", "alpine", "3.18.0", "Ariadne Conill <[email protected]>", "x86_64", "Apache-2.0", "730b75e01c670e3dba5d6c05420b5f605edb6201"),
getInventory("testdata/installed", "libssl3", "openssl", "3.1.0-r4", "alpine", "3.18.0", "Ariadne Conill <[email protected]>", "x86_64", "Apache-2.0", "730b75e01c670e3dba5d6c05420b5f605edb6201"),
getInventory("testdata/installed", "musl", "musl", "1.2.4-r0", "alpine", "3.18.0", "Timo Teräs <[email protected]>", "x86_64", "MIT", "b0d8a9d948174e28a4aefcee4ef6be872225ccce"),
getInventory("testdata/installed", "musl-utils", "musl", "1.2.4-r0", "alpine", "3.18.0", "Timo Teräs <[email protected]>", "x86_64", "MIT AND BSD-2-Clause AND GPL-2.0-or-later", "b0d8a9d948174e28a4aefcee4ef6be872225ccce"),
getInventory("testdata/installed", "scanelf", "pax-utils", "1.3.7-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "84a227baf001b6e0208e3352b294e4d7a40e93de"),
getInventory("testdata/installed", "ssl_client", "busybox", "1.36.0-r9", "alpine", "3.18.0", "Sören Tempel <[email protected]>", "x86_64", "GPL-2.0-only", "b5c719c244319df3c72ab1f1ee994c2143cab7f0"),
getInventory("testdata/installed", "zlib", "zlib", "1.2.13-r1", "alpine", "3.18.0", "Natanael Copa <[email protected]>", "x86_64", "Zlib", "84a227baf001b6e0208e3352b294e4d7a40e93de"),
},
wantResultMetric: stats.FileExtractedResultSuccess,
},
Expand All @@ -182,16 +182,17 @@ func TestExtract(t *testing.T) {
{
name: "invalid",
path: "testdata/invalid",
wantInventory: []*extractor.Inventory{},
wantResultMetric: stats.FileExtractedResultSuccess,
wantInventory: nil,
wantErr: cmpopts.AnyError,
wantResultMetric: stats.FileExtractedResultErrorUnknown,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there a reason invalid apk installed files still gets successfully extracted?

This does change the behavior to make this extractor fail on invalid files.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In some cases like NPM extraction we opted to skip invalid formats and not return an error, e.g. https://github.com/google/osv-scalibr/blob/main/extractor/filesystem/language/javascript/packagejson/extractor.go#L177

The reasoning was that it's not the extractor that's the problem but the file, so the scan shouldn't be marked "failed" just because there was an invalid file somewhere.

Copy link
Collaborator Author

@another-rex another-rex Oct 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be handled at the scanner level? The extraction still failed, it should be up to the caller to determine what to do in those cases. We can add a "ErrInvalidFile" error to be returned that can be handled by the caller?

This is something we would want to configure in osv-scanner, where if there is an invalid file we generally want to notify the user that we didn't fully complete the scan and let them correct the file. (Though I also do see how having a log entry also kind of accomplishes this, it would be nice to have more control on the caller side)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Historically the idea was that the plugin is the best place to determine if this is maybe not the intended file or the file itself is invalid.

Example for a file which FileRequired says yes, but Extract should ignore and not through an error:

  • For go binaries FileRequired returns true based on some heuristics, ignoring the file content to be efficient. Extract then parses the file and might figure it's not a go binary. Thus this plugin should just ignore it.
  • package.json is used by multiple applications (e.g. vscode extensions). FileRequired just checks the filename. Extract can then determine if this is actually a javascript package or not.

Example for invalid file, which should error:

  • DPKG location is /var/lib/dpkg/status. If the parser finds an error, it's very likely because the file is corrupted. It's unlikely that there is any other app using that path for something else.

So in case of APK, which also has a quite unique location (lib/apk/db/installed), I'd say an invalid file should fail, yes.

},
{
name: "osrelease openwrt",
path: "testdata/single",
osrelease: `ID=openwrt
VERSION_ID=1.2.3`,
wantInventory: []*extractor.Inventory{
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "openwrt", "1.2.3", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1YCAH7jdO2W816b85sUh9Z8av4Cc="),
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "openwrt", "1.2.3", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "65502ca9379dd29d1ac4b0bf0dcf03a3dd1b324a"),
},
wantResultMetric: stats.FileExtractedResultSuccess,
},
Expand All @@ -200,7 +201,7 @@ func TestExtract(t *testing.T) {
path: "testdata/single",
osrelease: "ID=openwrt",
wantInventory: []*extractor.Inventory{
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "openwrt", "", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1YCAH7jdO2W816b85sUh9Z8av4Cc="),
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "openwrt", "", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "65502ca9379dd29d1ac4b0bf0dcf03a3dd1b324a"),
},
wantResultMetric: stats.FileExtractedResultSuccess,
},
Expand All @@ -209,7 +210,7 @@ func TestExtract(t *testing.T) {
path: "testdata/single",
osrelease: "",
wantInventory: []*extractor.Inventory{
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "", "", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "Q1YCAH7jdO2W816b85sUh9Z8av4Cc="),
getInventory("testdata/single", "alpine-baselayout-data", "alpine-baselayout", "3.4.3-r1", "", "", "Natanael Copa <[email protected]>", "x86_64", "GPL-2.0-only", "65502ca9379dd29d1ac4b0bf0dcf03a3dd1b324a"),
},
wantResultMetric: stats.FileExtractedResultSuccess,
},
Expand Down
Loading