From a757dc346439a9f78bbfe0cd15648bcc866f7b1d Mon Sep 17 00:00:00 2001
From: Almir Kazazic <kazazic@gmail.com>
Date: Wed, 13 Dec 2023 13:27:03 +0100
Subject: [PATCH] initial attempt

just a first stab at changing TextExtractionHandler
---
 .../Core/Handlers/TextExtractionHandler.cs    | 39 ++++++++++---------
 service/Core/MemoryDocument.cs                | 33 ++++++++++++++++
 service/Core/MemoryDocumentExtensions.cs      | 14 +++++++
 service/Core/MemoryDocumentPage.cs            |  9 +++++
 4 files changed, 76 insertions(+), 19 deletions(-)
 create mode 100644 service/Core/MemoryDocument.cs
 create mode 100644 service/Core/MemoryDocumentExtensions.cs
 create mode 100644 service/Core/MemoryDocumentPage.cs

diff --git a/service/Core/Handlers/TextExtractionHandler.cs b/service/Core/Handlers/TextExtractionHandler.cs
index 6aca7446b..b73c7aa12 100644
--- a/service/Core/Handlers/TextExtractionHandler.cs
+++ b/service/Core/Handlers/TextExtractionHandler.cs
@@ -67,13 +67,14 @@ public TextExtractionHandler(
             var destFile = $"{uploadedFile.Name}.extract.txt";
             BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, sourceFile, cancellationToken).ConfigureAwait(false);
 
-            string text = string.Empty;
+            //string text = string.Empty;
+            var memoryDocument = new MemoryDocument();
             string extractType = MimeTypes.PlainText;
             bool skipFile = false;
 
             if (fileContent.ToArray().Length > 0)
             {
-                (text, extractType, skipFile) = await this.ExtractTextAsync(uploadedFile, fileContent, cancellationToken).ConfigureAwait(false);
+                (memoryDocument, extractType, skipFile) = await this.ExtractTextAsync(uploadedFile, fileContent, cancellationToken).ConfigureAwait(false);
             }
 
             // If the handler cannot extract text, we move on. There might be other handlers in the pipeline
@@ -83,14 +84,14 @@ public TextExtractionHandler(
             if (!skipFile)
             {
                 this._log.LogDebug("Saving extracted text file {0}", destFile);
-                await this._orchestrator.WriteFileAsync(pipeline, destFile, new BinaryData(text), cancellationToken).ConfigureAwait(false);
+                await this._orchestrator.WriteFileAsync(pipeline, destFile, new BinaryData(memoryDocument), cancellationToken).ConfigureAwait(false);
 
                 var destFileDetails = new DataPipeline.GeneratedFileDetails
                 {
                     Id = Guid.NewGuid().ToString("N"),
                     ParentId = uploadedFile.Id,
                     Name = destFile,
-                    Size = text.Length,
+                    Size = memoryDocument.CompleteContent.Length,
                     MimeType = extractType,
                     ArtifactType = DataPipeline.ArtifactTypes.ExtractedText,
                     Tags = pipeline.Tags,
@@ -106,54 +107,53 @@ public TextExtractionHandler(
         return (true, pipeline);
     }
 
-    private async Task<(string text, string extractType, bool skipFile)> ExtractTextAsync(
-        DataPipeline.FileDetails uploadedFile,
+    private async Task<(MemoryDocument text, string extractType, bool skipFile)> ExtractTextAsync(DataPipeline.FileDetails uploadedFile,
         BinaryData fileContent,
         CancellationToken cancellationToken)
     {
         bool skipFile = false;
-        string text = string.Empty;
+        var memoryDocument = new MemoryDocument();
         string extractType = MimeTypes.PlainText;
 
         switch (uploadedFile.MimeType)
         {
             case MimeTypes.PlainText:
                 this._log.LogDebug("Extracting text from plain text file {0}", uploadedFile.Name);
-                text = fileContent.ToString();
+                memoryDocument = fileContent.ToString().ToDocument();
                 break;
 
             case MimeTypes.MarkDown:
                 this._log.LogDebug("Extracting text from MarkDown file {0}", uploadedFile.Name);
-                text = fileContent.ToString();
+                memoryDocument = fileContent.ToString().ToDocument();
                 extractType = MimeTypes.MarkDown;
                 break;
 
             case MimeTypes.Json:
                 this._log.LogDebug("Extracting text from JSON file {0}", uploadedFile.Name);
-                text = fileContent.ToString();
+                memoryDocument = fileContent.ToString().ToDocument();
                 break;
 
             case MimeTypes.MsWord:
                 this._log.LogDebug("Extracting text from MS Word file {0}", uploadedFile.Name);
-                text = new MsWordDecoder().DocToText(fileContent);
+                memoryDocument = new MsWordDecoder().DocToText(fileContent).ToDocument();
                 break;
 
             case MimeTypes.MsPowerPoint:
                 this._log.LogDebug("Extracting text from MS PowerPoint file {0}", uploadedFile.Name);
-                text = new MsPowerPointDecoder().DocToText(fileContent,
+                memoryDocument = new MsPowerPointDecoder().DocToText(fileContent,
                     withSlideNumber: true,
                     withEndOfSlideMarker: false,
-                    skipHiddenSlides: true);
+                    skipHiddenSlides: true).ToDocument();
                 break;
 
             case MimeTypes.MsExcel:
                 this._log.LogDebug("Extracting text from MS Excel file {0}", uploadedFile.Name);
-                text = new MsExcelDecoder().DocToText(fileContent);
+                memoryDocument = new MsExcelDecoder().DocToText(fileContent).ToDocument();
                 break;
 
             case MimeTypes.Pdf:
                 this._log.LogDebug("Extracting text from PDF file {0}", uploadedFile.Name);
-                text = new PdfDecoder().DocToText(fileContent);
+                memoryDocument = new PdfDecoder().DocToText(fileContent).ToDocument();
                 break;
 
             case MimeTypes.WebPageUrl:
@@ -184,8 +184,8 @@ public TextExtractionHandler(
                     break;
                 }
 
-                text = result.Text;
-                this._log.LogDebug("Web page {0} downloaded, text length: {1}", url, text.Length);
+                memoryDocument = result.Text.ToDocument();
+                this._log.LogDebug("Web page {0} downloaded, text length: {1}", url, result.Text.Length); // log the length, not the page body
                 break;
 
             case "":
@@ -203,7 +203,8 @@ public TextExtractionHandler(
                     throw new NotSupportedException($"Image extraction not configured: {uploadedFile.Name}");
                 }
 
-                text = await new ImageDecoder().ImageToTextAsync(this._ocrEngine, fileContent, cancellationToken).ConfigureAwait(false);
+                var imageText = await new ImageDecoder().ImageToTextAsync(this._ocrEngine, fileContent, cancellationToken).ConfigureAwait(false);
+                memoryDocument = imageText.ToDocument();
                 break;
 
             default:
@@ -213,6 +214,6 @@ public TextExtractionHandler(
                 break;
         }
 
-        return (text, extractType, skipFile);
+        return (memoryDocument, extractType, skipFile);
     }
 }
diff --git a/service/Core/MemoryDocument.cs b/service/Core/MemoryDocument.cs
new file mode 100644
index 000000000..02a76da38
--- /dev/null
+++ b/service/Core/MemoryDocument.cs
@@ -0,0 +1,33 @@
+﻿// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Text;
+
+namespace Microsoft.KernelMemory;
+
+public class MemoryDocument
+{
+    /// <summary>Pages of the document, in the order they were added.</summary>
+    public List<MemoryDocumentPage> Pages { get; set; } = new List<MemoryDocumentPage>();
+
+    /// <summary>Text of every page concatenated in list order, with no separator between pages.</summary>
+    public string CompleteContent
+    {
+        get
+        {
+            var content = new StringBuilder();
+            this.Pages.ForEach(page => content.Append(page.Text));
+            return content.ToString();
+        }
+    }
+
+    /// <summary>Appends a page holding <paramref name="str"/>, numbered one past the current page count.</summary>
+    public void AddPage(string str)
+    {
+        this.Pages.Add(new MemoryDocumentPage
+        {
+            Text = str,
+            PageNumber = this.Pages.Count + 1,
+        });
+    }
+}
diff --git a/service/Core/MemoryDocumentExtensions.cs b/service/Core/MemoryDocumentExtensions.cs
new file mode 100644
index 000000000..9d8fc7a72
--- /dev/null
+++ b/service/Core/MemoryDocumentExtensions.cs
@@ -0,0 +1,14 @@
+﻿// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.KernelMemory;
+
+public static class MemoryDocumentExtensions
+{
+    /// <summary>Wraps <paramref name="str"/> in a new <see cref="MemoryDocument"/> as its only page.</summary>
+    public static MemoryDocument ToDocument(this string str)
+    {
+        var document = new MemoryDocument();
+        document.AddPage(str);
+        return document;
+    }
+}
diff --git a/service/Core/MemoryDocumentPage.cs b/service/Core/MemoryDocumentPage.cs
new file mode 100644
index 000000000..e723674ea
--- /dev/null
+++ b/service/Core/MemoryDocumentPage.cs
@@ -0,0 +1,9 @@
+﻿// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.KernelMemory;
+
+public class MemoryDocumentPage
+{
+    public int PageNumber { get; set; } // MemoryDocument.AddPage assigns these starting at 1
+    public string Text { get; set; } = string.Empty; // defaults to empty rather than null (avoids CS8618)
+}