Skip to content
This repository was archived by the owner on Oct 30, 2024. It is now read-only.

Commit 8a7aea3

Browse files
authored
change: by default, do not exit on a failing file ingestion, but make sure we log the failed status (#154)
1 parent c91d365 commit 8a7aea3

File tree

5 files changed: 21 additions and 8 deletions.

pkg/client/client.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ type IngestPathsOpts struct {
2323
IsDuplicateFuncName string
2424
Prune bool // Prune deleted files
2525
ErrOnUnsupportedFile bool
26+
ExitOnFailedFile bool
2627
}
2728

2829
type Client interface {

pkg/client/common.go

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,12 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
149149
}
150150
defer sem.Release(1)
151151

152-
ingestedFilesCount++
153152
slog.Debug("Ingesting file", "path", absPath, "metadata", currentMetadata)
154-
return ingestionFunc(sp, currentMetadata.Metadata[filepath.Base(sp)]) // FIXME: metadata
153+
err = ingestionFunc(sp, currentMetadata.Metadata[filepath.Base(sp)]) // FIXME: metadata
154+
if err == nil {
155+
ingestedFilesCount++
156+
}
157+
return err
155158
})
156159
return nil
157160
})
@@ -178,13 +181,16 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID
178181
}
179182
defer sem.Release(1)
180183

181-
ingestedFilesCount++
182184
var fileMetadata FileMetadata
183185
if len(metadataStack) > 0 {
184186
currentMetadata := metadataStack[len(metadataStack)-1]
185187
fileMetadata = currentMetadata.Metadata[filepath.Base(path)]
186188
}
187-
return ingestionFunc(path, fileMetadata)
189+
err = ingestionFunc(path, fileMetadata)
190+
if err == nil {
191+
ingestedFilesCount++
192+
}
193+
return err
188194
})
189195
}
190196

pkg/client/standalone.go

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,11 @@ func (c *StandaloneClient) ListDatasets(ctx context.Context) ([]types.Dataset, e
6767
}
6868

6969
func (c *StandaloneClient) Ingest(ctx context.Context, datasetID string, name string, data []byte, opts datastore.IngestOpts) ([]string, error) {
70-
return c.Datastore.Ingest(ctx, datasetID, name, data, opts)
70+
ids, err := c.Datastore.Ingest(ctx, datasetID, name, data, opts)
71+
if err != nil {
72+
log.FromCtx(ctx).With("status", "failed").With("error", err.Error()).Error("Ingest failed")
73+
}
74+
return ids, err
7175
}
7276

7377
func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, opts *IngestPathsOpts, paths ...string) (int, error) {
@@ -111,7 +115,7 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op
111115
iopts.IngestionFlows = opts.IngestionFlows
112116
}
113117

114-
_, err = c.Ingest(log.ToCtx(ctx, log.FromCtx(ctx).With("filepath", path)), datasetID, filename, file, iopts)
118+
_, err = c.Ingest(log.ToCtx(ctx, log.FromCtx(ctx).With("filepath", path).With("absolute_path", iopts.FileMetadata.AbsolutePath)), datasetID, filename, file, iopts)
115119

116120
if err != nil && !opts.ErrOnUnsupportedFile && errors.Is(err, &documentloader.UnsupportedFileTypeError{}) {
117121
err = nil

pkg/cmd/ingest.go

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ type ClientIngestOpts struct {
3535
NoCreateDataset bool `usage:"Do NOT create the dataset if it doesn't exist" default:"true" env:"KNOW_INGEST_NO_CREATE_DATASET"`
3636
DeduplicationFuncName string `usage:"Name of the deduplication function to use" name:"dedupe-func" env:"KNOW_INGEST_DEDUPE_FUNC"`
3737
ErrOnUnsupportedFile bool `usage:"Error on unsupported file types" default:"false" env:"KNOW_INGEST_ERR_ON_UNSUPPORTED_FILE"`
38+
ExitOnFailedFile bool `usage:"Exit directly on failed file" default:"false" env:"KNOW_INGEST_EXIT_ON_FAILED_FILE"`
3839
}
3940

4041
func (s *ClientIngest) Customize(cmd *cobra.Command) {
@@ -80,6 +81,7 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
8081
IsDuplicateFuncName: s.DeduplicationFuncName,
8182
Prune: s.Prune,
8283
ErrOnUnsupportedFile: s.ErrOnUnsupportedFile,
84+
ExitOnFailedFile: s.ExitOnFailedFile,
8385
}
8486

8587
if s.FlowsFile != "" {
@@ -119,7 +121,7 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error {
119121

120122
filesIngested, err := c.IngestPaths(ctx, datasetID, ingestOpts, filePath)
121123
if err != nil {
122-
return err
124+
return fmt.Errorf("ingested %d files but encountered at least one error: %w", filesIngested, err)
123125
}
124126

125127
fmt.Printf("Ingested %d files from %q into dataset %q (took: %s)\n", filesIngested, filePath, datasetID, time.Since(startTime))

pkg/datastore/ingest.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, name string, c
191191
startTime := time.Now()
192192
docIDs, err := s.Vectorstore.AddDocuments(ctx, docs, datasetID)
193193
if err != nil {
194-
statusLog.With("component", "vectorstore").Error("Failed to add documents", "error", err)
194+
statusLog.With("component", "vectorstore").With("status", "failed").With("error", err.Error()).Error("Failed to add documents")
195195
return nil, fmt.Errorf("failed to add documents from file %q: %w", opts.FileMetadata.AbsolutePath, err)
196196
}
197197
statusLog.Debug("Added documents to vectorstore", "duration", time.Since(startTime))

0 commit comments

Comments
 (0)