﻿// Import necessary namespaces
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using VedAstro.Library;

namespace DocToEmbeddings
{
    /// <summary>
    /// One node of the chunk tree: a piece of text, the embeddings stored for it,
    /// and the smaller chunks that the same text was further divided into.
    /// </summary>
    public class Chunk
    {
        /// <summary>The text held by this chunk.</summary>
        public string Text { get; set; }

        /// <summary>
        /// Number of characters in <see cref="Text"/>; 0 when <see cref="Text"/> is null,
        /// so reading the count of a freshly constructed chunk does not throw.
        /// </summary>
        public int CharacterCount => Text?.Length ?? 0;

        /// <summary>Embeddings associated with <see cref="Text"/>, stored as a string.</summary>
        public string Embeddings { get; set; }

        /// <summary>
        /// Chunks nested under this one. Starts as an empty list (not null) so that
        /// a chunk without children can be enumerated without a null check.
        /// </summary>
        public List<Chunk> ChildChunks { get; set; } = new List<Chunk>();
    }

    internal class Program
    {
        /// <summary>
        /// Entry point: extracts the text of a PDF, converts it into a chunk tree,
        /// then waits for a line on standard input so the result can be inspected.
        /// </summary>
        /// <param name="args">
        /// Optional. When the first argument is a non-blank string it is used as the PDF path;
        /// otherwise the built-in default path is used.
        /// </param>
        static void Main(string[] args)
        {
            // Default book, used when no path is given on the command line
            const string DefaultPdfFilePath = @"C:\Users\ASUS\Desktop\Projects\VedAstro\Others\NotCode\Books\Muhurtha Or Electional Astrology (text only).pdf";

            // Let the caller point the tool at another PDF without recompiling
            var hasPathArgument = args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]);
            var pdfFilePath = hasPathArgument ? args[0] : DefaultPdfFilePath;

            // Extract the text of every page of the PDF
            string fullPdfText = ExtractTextFromPDF(pdfFilePath);

            // Convert the extracted text into a tree structure
            // (the variable is kept so it can be examined in a debugger while paused below)
            var chunkTreeStem = ConvertTextToTreeStructure(fullPdfText);

            // Pause the program for inspection
            Console.ReadLine();
        }

        /// <summary>
        /// Reads the PDF at <paramref name="filePath"/> and returns the text of all its pages,
        /// in page order, with a newline between consecutive pages.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to open.</param>
        /// <returns>The text of pages 1..NumberOfPages joined by '\n'.</returns>
        private static string ExtractTextFromPDF(string filePath)
        {
            using (var pdfReader = new PdfReader(filePath))
            {
                // Page numbers passed to the extractor start at 1
                var pageTexts = Enumerable
                    .Range(1, pdfReader.NumberOfPages)
                    .Select(pageNumber => PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber));

                // Join with a separator: plain concatenation would glue the last word of one page
                // to the first word of the next. The sequence is enumerated here, while the
                // reader is still open.
                return string.Join("\n", pageTexts);
            }
        }

        /// <summary>
        /// Breaks a long text into a tree of chunks. The text is cut into consecutive pieces of at
        /// most <paramref name="MAX_CHARS"/> characters, preferring to cut at the last space found
        /// at or before index <paramref name="MAX_CHARS"/>. Each piece cut this way is divided again
        /// recursively, using half the limit (but never less than 1000 characters), and the result
        /// is stored as that piece's child chunks.
        /// </summary>
        /// <param name="rawLargeText">Text to split. Null or whitespace-only text yields an empty list.</param>
        /// <param name="MAX_CHARS">Maximum number of characters per chunk at this level; must be positive.</param>
        /// <returns>
        /// The chunks of this level, in text order. Every chunk text is trimmed and non-empty.
        /// The final remainder (the text that already fits in the limit) is added as a chunk
        /// for which this method does not assign any child chunks.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="MAX_CHARS"/> is zero or negative.</exception>
        private static List<Chunk> ConvertTextToTreeStructure(string rawLargeText, int MAX_CHARS = 35000)
        {
            // A limit of zero can never consume any text, which would make the loop below spin forever
            if (MAX_CHARS <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(MAX_CHARS), MAX_CHARS, "MAX_CHARS must be greater than zero.");
            }

            var chunks = new List<Chunk>();

            // Nothing to split
            if (string.IsNullOrWhiteSpace(rawLargeText))
            {
                return chunks;
            }

            // Limit used for the next level down: half of this level's limit, floored at
            // SmallestTextChunk so the limit stops shrinking and the recursion bottoms out.
            // It does not depend on the loop, so it is computed once.
            const int SmallestTextChunk = 1000;
            int childMaxChars = Math.Max(MAX_CHARS / 2, SmallestTextChunk);

            // Keep cutting pieces off the front until what is left fits in one chunk
            while (rawLargeText.Length > MAX_CHARS)
            {
                // Look backwards from index MAX_CHARS for a space so words are not split.
                // A space at index 0 is ignored: cutting there would produce a zero-length piece.
                int lastSpaceIndex = rawLargeText.LastIndexOf(' ', MAX_CHARS);
                int chunkLength = lastSpaceIndex > 0 ? lastSpaceIndex : MAX_CHARS;

                var chunkText = rawLargeText.Substring(0, chunkLength).Trim();

                // A piece made only of whitespace trims down to nothing; skip it instead of
                // storing an empty chunk
                if (chunkText.Length > 0)
                {
                    chunks.Add(new Chunk
                    {
                        Text = chunkText,

                        // Placeholder value; GetEmbeddingsFromAdaLLMAPI is not called here.
                        // NOTE(review): this placeholder and the one used for the remainder below
                        // differ in length - confirm whether that is intentional.
                        Embeddings = "000000",

                        // Divide this piece again with the smaller limit.
                        // NOTE(review): when the piece already fits in the smaller limit, the only
                        // child is a chunk with the same text as its parent.
                        ChildChunks = ConvertTextToTreeStructure(chunkText, childMaxChars)
                    });
                }

                // chunkLength is always at least 1 here, so the remaining text gets strictly shorter
                rawLargeText = rawLargeText.Substring(chunkLength).Trim();
            }

            // Whatever is left fits within the limit; trim it so it matches the other chunks
            var remainingText = rawLargeText.Trim();
            if (remainingText.Length > 0)
            {
                chunks.Add(new Chunk
                {
                    Text = remainingText,

                    // Placeholder value; no embeddings are generated here
                    Embeddings = "00000"
                });
            }

            return chunks;
        }


        /// <summary>
        /// Passes <paramref name="rawText"/> to LLMEmbeddingManager.GetEmbeddingsForText_Ada002
        /// and returns the <c>Result</c> of what that call hands back.
        /// </summary>
        /// <param name="rawText">Text to request embeddings for.</param>
        /// <returns>The string obtained from the call's <c>Result</c>.</returns>
        private static string GetEmbeddingsFromAdaLLMAPI(string rawText)
        {
            // NOTE(review): presumably this is a Task<string>, in which case reading Result
            // blocks the calling thread until the request finishes - confirm against the library
            var pendingEmbeddings = LLMEmbeddingManager.GetEmbeddingsForText_Ada002(rawText);

            return pendingEmbeddings.Result;
        }
    }
}
