From a757dc346439a9f78bbfe0cd15648bcc866f7b1d Mon Sep 17 00:00:00 2001 From: Almir Kazazic Date: Wed, 13 Dec 2023 13:27:03 +0100 Subject: [PATCH] Initial attempt: just a first stab at changing TextExtractionHandler --- .../Core/Handlers/TextExtractionHandler.cs | 39 ++++++++++--------- service/Core/MemoryDocument.cs | 33 ++++++++++++++++ service/Core/MemoryDocumentExtensions.cs | 14 +++++++ service/Core/MemoryDocumentPage.cs | 9 +++++ 4 files changed, 76 insertions(+), 19 deletions(-) create mode 100644 service/Core/MemoryDocument.cs create mode 100644 service/Core/MemoryDocumentExtensions.cs create mode 100644 service/Core/MemoryDocumentPage.cs diff --git a/service/Core/Handlers/TextExtractionHandler.cs b/service/Core/Handlers/TextExtractionHandler.cs index 6aca7446b..b73c7aa12 100644 --- a/service/Core/Handlers/TextExtractionHandler.cs +++ b/service/Core/Handlers/TextExtractionHandler.cs @@ -67,13 +67,14 @@ public TextExtractionHandler( var destFile = $"{uploadedFile.Name}.extract.txt"; BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, sourceFile, cancellationToken).ConfigureAwait(false); - string text = string.Empty; + //string text = string.Empty; + var memoryDocument = new MemoryDocument(); string extractType = MimeTypes.PlainText; bool skipFile = false; if (fileContent.ToArray().Length > 0) { - (text, extractType, skipFile) = await this.ExtractTextAsync(uploadedFile, fileContent, cancellationToken).ConfigureAwait(false); + (memoryDocument, extractType, skipFile) = await this.ExtractTextAsync(uploadedFile, fileContent, cancellationToken).ConfigureAwait(false); } // If the handler cannot extract text, we move on. 
There might be other handlers in the pipeline @@ -83,14 +84,14 @@ public TextExtractionHandler( if (!skipFile) { this._log.LogDebug("Saving extracted text file {0}", destFile); - await this._orchestrator.WriteFileAsync(pipeline, destFile, new BinaryData(text), cancellationToken).ConfigureAwait(false); + await this._orchestrator.WriteFileAsync(pipeline, destFile, new BinaryData(memoryDocument.CompleteContent), cancellationToken).ConfigureAwait(false); var destFileDetails = new DataPipeline.GeneratedFileDetails { Id = Guid.NewGuid().ToString("N"), ParentId = uploadedFile.Id, Name = destFile, - Size = text.Length, + Size = memoryDocument.CompleteContent.Length, MimeType = extractType, ArtifactType = DataPipeline.ArtifactTypes.ExtractedText, Tags = pipeline.Tags, @@ -106,54 +107,53 @@ public TextExtractionHandler( return (true, pipeline); } - private async Task<(string text, string extractType, bool skipFile)> ExtractTextAsync( - DataPipeline.FileDetails uploadedFile, + private async Task<(MemoryDocument text, string extractType, bool skipFile)> ExtractTextAsync(DataPipeline.FileDetails uploadedFile, BinaryData fileContent, CancellationToken cancellationToken) { bool skipFile = false; - string text = string.Empty; + var memoryDocument = new MemoryDocument(); string extractType = MimeTypes.PlainText; switch (uploadedFile.MimeType) { case MimeTypes.PlainText: this._log.LogDebug("Extracting text from plain text file {0}", uploadedFile.Name); - text = fileContent.ToString(); + memoryDocument = fileContent.ToString().ToDocument(); break; case MimeTypes.MarkDown: this._log.LogDebug("Extracting text from MarkDown file {0}", uploadedFile.Name); - text = fileContent.ToString(); + memoryDocument = fileContent.ToString().ToDocument(); extractType = MimeTypes.MarkDown; break; case MimeTypes.Json: this._log.LogDebug("Extracting text from JSON file {0}", uploadedFile.Name); - text = fileContent.ToString(); + memoryDocument = fileContent.ToString().ToDocument(); break; case MimeTypes.MsWord: 
this._log.LogDebug("Extracting text from MS Word file {0}", uploadedFile.Name); - text = new MsWordDecoder().DocToText(fileContent); + memoryDocument = new MsWordDecoder().DocToText(fileContent).ToDocument(); break; case MimeTypes.MsPowerPoint: this._log.LogDebug("Extracting text from MS PowerPoint file {0}", uploadedFile.Name); - text = new MsPowerPointDecoder().DocToText(fileContent, + memoryDocument = new MsPowerPointDecoder().DocToText(fileContent, withSlideNumber: true, withEndOfSlideMarker: false, - skipHiddenSlides: true); + skipHiddenSlides: true).ToDocument(); break; case MimeTypes.MsExcel: this._log.LogDebug("Extracting text from MS Excel file {0}", uploadedFile.Name); - text = new MsExcelDecoder().DocToText(fileContent); + memoryDocument = new MsExcelDecoder().DocToText(fileContent).ToDocument(); break; case MimeTypes.Pdf: this._log.LogDebug("Extracting text from PDF file {0}", uploadedFile.Name); - text = new PdfDecoder().DocToText(fileContent); + memoryDocument = new PdfDecoder().DocToText(fileContent).ToDocument(); break; case MimeTypes.WebPageUrl: @@ -184,8 +184,8 @@ public TextExtractionHandler( break; } - text = result.Text; - this._log.LogDebug("Web page {0} downloaded, text length: {1}", url, text.Length); + memoryDocument = result.Text.ToDocument(); + this._log.LogDebug("Web page {0} downloaded, text length: {1}", url, result.Text.Length); break; case "": @@ -203,7 +203,8 @@ public TextExtractionHandler( throw new NotSupportedException($"Image extraction not configured: {uploadedFile.Name}"); } - text = await new ImageDecoder().ImageToTextAsync(this._ocrEngine, fileContent, cancellationToken).ConfigureAwait(false); + var imageText = await new ImageDecoder().ImageToTextAsync(this._ocrEngine, fileContent, cancellationToken).ConfigureAwait(false); + memoryDocument = imageText.ToDocument(); break; default: @@ -213,6 +214,6 @@ public TextExtractionHandler( break; } - return (text, extractType, skipFile); + return (memoryDocument, extractType, skipFile); } } 
diff --git a/service/Core/MemoryDocument.cs b/service/Core/MemoryDocument.cs
new file mode 100644
index 000000000..02a76da38
--- /dev/null
+++ b/service/Core/MemoryDocument.cs
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Text;
+
+namespace Microsoft.KernelMemory;
+
+/// <summary>
+/// Text extracted from a source file, kept as an ordered list of pages.
+/// </summary>
+public class MemoryDocument
+{
+    /// <summary>
+    /// Pages in document order. <see cref="AddPage"/> appends to this list.
+    /// </summary>
+    public List<MemoryDocumentPage> Pages { get; set; } = new List<MemoryDocumentPage>();
+
+    /// <summary>
+    /// Text of all pages concatenated in list order, with no separator between pages.
+    /// The string is rebuilt on every read.
+    /// </summary>
+    public string CompleteContent
+    {
+        get
+        {
+            var sb = new StringBuilder();
+            foreach (MemoryDocumentPage page in this.Pages)
+            {
+                sb.Append(page.Text);
+            }
+
+            return sb.ToString();
+        }
+    }
+
+    /// <summary>
+    /// Appends a new page holding the given text, numbered one past the current page count.
+    /// </summary>
+    /// <param name="str">Text of the new page.</param>
+    public void AddPage(string str)
+    {
+        this.Pages.Add(new MemoryDocumentPage
+        {
+            Text = str,
+            PageNumber = this.Pages.Count + 1,
+        });
+    }
+}
diff --git a/service/Core/MemoryDocumentExtensions.cs b/service/Core/MemoryDocumentExtensions.cs
new file mode 100644
index 000000000..9d8fc7a72
--- /dev/null
+++ b/service/Core/MemoryDocumentExtensions.cs
@@ -0,0 +1,21 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.KernelMemory;
+
+/// <summary>
+/// Helpers for building <see cref="MemoryDocument"/> instances.
+/// </summary>
+public static class MemoryDocumentExtensions
+{
+    /// <summary>
+    /// Wraps the whole string in a new document containing exactly one page (page number 1).
+    /// </summary>
+    /// <param name="str">Text to wrap.</param>
+    /// <returns>A new document whose only page holds <paramref name="str"/>.</returns>
+    public static MemoryDocument ToDocument(this string str)
+    {
+        var result = new MemoryDocument();
+        result.AddPage(str);
+        return result;
+    }
+}
diff --git a/service/Core/MemoryDocumentPage.cs b/service/Core/MemoryDocumentPage.cs
new file mode 100644
index 000000000..e723674ea
--- /dev/null
+++ b/service/Core/MemoryDocumentPage.cs
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.KernelMemory;
+
+/// <summary>
+/// One page of a <see cref="MemoryDocument"/>.
+/// </summary>
+public class MemoryDocumentPage
+{
+    /// <summary>
+    /// Page number; <see cref="MemoryDocument.AddPage"/> assigns these starting at 1.
+    /// </summary>
+    public int PageNumber { get; set; }
+
+    /// <summary>
+    /// Text of the page. Defaults to an empty string so a freshly created page never exposes null text.
+    /// </summary>
+    public string Text { get; set; } = string.Empty;
+}