From 5015b1371fbd04671ef3aa414bb4eb2a66402d26 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 25 Oct 2024 09:42:07 +0200 Subject: [PATCH] change: simpler way of handling metadata inserted by .knowledge.json (#156) --- pkg/client/common.go | 41 ++++++++++----------- pkg/client/metadata.go | 83 +++++++++++++++++++++++++----------------- 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/pkg/client/common.go b/pkg/client/common.go index 018f4e82..3b7ab37a 100644 --- a/pkg/client/common.go +++ b/pkg/client/common.go @@ -94,12 +94,13 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID } if fileInfo.IsDir() { - initialMetadata := &Metadata{Metadata: map[string]FileMetadata{}} - directoryMetadata, err := loadAndMergeMetadata(path, initialMetadata) + directoryMetadata, err := loadDirMetadata(path) if err != nil { return ingestedFilesCount, err } - metadataStack = append(metadataStack, *directoryMetadata) + if directoryMetadata != nil { + metadataStack = append(metadataStack, *directoryMetadata) + } // Process directory err = filepath.WalkDir(path, func(subPath string, d os.DirEntry, err error) error { @@ -115,12 +116,13 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID } // One dir level deeper -> load new metadata - parentMetadata := metadataStack[len(metadataStack)-1] - newMetadata, err := loadAndMergeMetadata(subPath, &parentMetadata) + newMetadata, err := loadDirMetadata(subPath) if err != nil { return err } - metadataStack = append(metadataStack, *newMetadata) + if newMetadata != nil { + metadataStack = append(metadataStack, *newMetadata) + } return nil } @@ -141,16 +143,19 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID } touchedFilePaths = append(touchedFilePaths, absPath) - currentMetadata := metadataStack[len(metadataStack)-1] - g.Go(func() error { if err := sem.Acquire(ctx, 1); err != nil { return err } defer sem.Release(1) - 
slog.Debug("Ingesting file", "path", absPath, "metadata", currentMetadata) - err = ingestionFunc(sp, currentMetadata.Metadata[filepath.Base(sp)]) // FIXME: metadata + fileMeta, err := findMetadata(absPath, metadataStack) + if err != nil { + return fmt.Errorf("failed to find metadata for %s: %w", absPath, err) + } + slog.Debug("Ingesting file", "absPath", absPath, "metadata", fileMeta) + + err = ingestionFunc(sp, fileMeta) if err == nil { ingestedFilesCount++ } @@ -161,8 +166,6 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID if err != nil { return ingestedFilesCount, err } - // Directory processed, pop metadata - metadataStack = metadataStack[:len(metadataStack)-1] } else { if isIgnored(ignore, path) { slog.Debug("Ignoring file", "path", path, "ignorefile", opts.IgnoreFile, "ignoreExtensions", opts.IgnoreExtensions) @@ -181,16 +184,12 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID } defer sem.Release(1) - var fileMetadata FileMetadata - if len(metadataStack) > 0 { - currentMetadata := metadataStack[len(metadataStack)-1] - fileMetadata = currentMetadata.Metadata[filepath.Base(path)] - } - err = ingestionFunc(path, fileMetadata) - if err == nil { - ingestedFilesCount++ + ingestedFilesCount++ + fileMeta, err := findMetadata(absPath, metadataStack) + if err != nil { + return fmt.Errorf("failed to find metadata for %s: %w", absPath, err) } - return err + return ingestionFunc(path, fileMeta) }) } diff --git a/pkg/client/metadata.go b/pkg/client/metadata.go index 4ee55137..488a348e 100644 --- a/pkg/client/metadata.go +++ b/pkg/client/metadata.go @@ -3,6 +3,7 @@ package client import ( "encoding/json" "fmt" + "log/slog" "os" "path/filepath" "strings" @@ -11,51 +12,67 @@ import ( const MetadataFilename = ".knowledge.json" type Metadata struct { - Metadata map[string]FileMetadata `json:"metadata"` // Map of file paths to metadata + MetadataFileAbsPath string + Metadata map[string]FileMetadata 
`json:"metadata"` // Map of file paths to metadata // TODO (idea): add other fields like description here, so we can hierarchically build a dataset description? Challenge is pruning and merging. } type FileMetadata map[string]any -func loadAndMergeMetadata(dirPath string, parentMetadata *Metadata) (*Metadata, error) { +// loadDirMetadata checks if the given directory contains a metadata file. +// If so, it reads and parses it, recording the absolute path of the metadata file. +// If the metadata file cannot be stat'ed (e.g. it does not exist), it returns nil without an error. +func loadDirMetadata(dirPath string) (*Metadata, error) { metadataPath := filepath.Join(dirPath, MetadataFilename) - dirName := filepath.Base(dirPath) - if _, err := os.Stat(metadataPath); err == nil { // Metadata file exists - fileContent, err := os.ReadFile(metadataPath) - if err != nil { - return nil, fmt.Errorf("failed to read metadata file %s: %w", metadataPath, err) - } + metaAbsPath, err := filepath.Abs(metadataPath) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for %s: %w", metadataPath, err) + } + dirPath = filepath.Dir(metadataPath) + if _, err := os.Stat(metadataPath); err != nil { + return nil, nil + } + // Metadata file exists + fileContent, err := os.ReadFile(metadataPath) + if err != nil { + return nil, fmt.Errorf("failed to read metadata file %s: %w", metadataPath, err) + } - var newMetadata Metadata - if err := json.Unmarshal(fileContent, &newMetadata); err != nil { - return nil, fmt.Errorf("failed to unmarshal metadata file %s: %w", metadataPath, err) - } + metadata := &Metadata{ + MetadataFileAbsPath: metaAbsPath, + } + if err := json.Unmarshal(fileContent, &metadata); err != nil { + return nil, fmt.Errorf("failed to unmarshal metadata file %s: %w", metadataPath, err) + } - // Merge with parent metadata, overriding existing keys - mergedMetadata := &Metadata{Metadata: make(map[string]FileMetadata, len(parentMetadata.Metadata)+len(newMetadata.Metadata))} - 
for filename, fileMetadata := range parentMetadata.Metadata { - if !strings.HasPrefix(filename, dirName) { - // skip entries which are not meant for this (sub-)directory - continue - } - fname := strings.TrimPrefix(strings.TrimPrefix(filename, dirName), string(filepath.Separator)) - mergedMetadata.Metadata[fname] = fileMetadata - } + slog.Info("Loaded metadata", "path", metadataPath, "metadata", metadata.Metadata) + + return metadata, nil - if newMetadata.Metadata != nil { - for filename, fileMetadata := range newMetadata.Metadata { - for k, v := range fileMetadata { - if mergedMetadata.Metadata[filename] == nil { - mergedMetadata.Metadata[filename] = make(FileMetadata, len(fileMetadata)) - } - mergedMetadata.Metadata[filename][k] = v - } +} + +func findMetadata(path string, metadataStack []Metadata) (FileMetadata, error) { + + absPath, err := filepath.Abs(path) + if err != nil { + return nil, err + } + + metadata := make(map[string]any) + + for _, metadataEntry := range metadataStack { + target := strings.TrimPrefix(strings.TrimPrefix(absPath, filepath.Dir(metadataEntry.MetadataFileAbsPath)), string(filepath.Separator)) + + if m, ok := metadataEntry.Metadata[target]; ok { + for k, v := range m { + metadata[k] = v } } - return mergedMetadata, nil } - // No metadata file, return parent metadata as is - return parentMetadata, nil + slog.Debug("Found metadata", "path", path, "metadata", metadata) + + return metadata, nil + }