<?php
// +-------------------------------------------------+
// (c) 2002-2004 PMB Services / www.sigb.net pmb@sigb.net et contributeurs (voir www.sigb.net)
// +-------------------------------------------------+
// $Id: TextChunker.php,v 1.1.2.5.2.1 2026/02/06 09:34:16 qvarin Exp $

namespace Pmb\AI\Library\Chunker;

// Refuse to run when this file is requested directly over HTTP.
// REQUEST_URI may be missing from $_SERVER (for instance when PHP is not
// serving a web request), so fall back to an empty string rather than reading
// an undefined key and handing null to stristr().
if (stristr($_SERVER['REQUEST_URI'] ?? '', '/'.basename(__FILE__))) {
    die("no access");
}

/**
 * Splits a text into word-bounded chunks, preferring sentence boundaries and
 * optionally repeating the end of a chunk at the start of the next one.
 *
 * Offsets and lengths reported for each chunk are byte counts relative to the
 * normalised text built by this class (trimmed sentences joined by a single
 * space), not to the raw input string.
 */
class TextChunker implements Chunker
{
    /**
     * Maximum number of words per chunk.
     *
     * @var int
     */
    public const MAX_WORDS = 500;

    /**
     * Number of words shared between two consecutive chunks.
     *
     * @var int
     */
    public const OVERLAP_WORDS = 50;

    /**
     * Average tokens-per-word ratio used for Mistral (1 word is about 1.6 tokens).
     *
     * @var float
     */
    public const TOKEN_RATIO = 1.6;

    /**
     * Maximum number of tokens per chunk for Mistral.
     *
     * @var int
     */
    public const MAX_TOKENS = 4000;

    /**
     * Sentence matcher shared by the splitter and the overlap extractor.
     *
     * First alternative: optional body, one or more terminators (. ? !) and any
     * closing quote/bracket. Second alternative: text with no terminator.
     * Together they consume every byte of the subject, in order, so nothing
     * is lost, duplicated or reordered.
     *
     * @var string
     */
    private const SENTENCE_PATTERN = '/[^.?!]*[.?!]+[\'")\]]*|[^.?!]+/';

    /**
     * Chunks a text using token limits, converted to word limits through TOKEN_RATIO.
     *
     * @param string $text
     * @param int $maxTokens Maximum number of tokens per chunk
     * @param int $overlapTokens Number of overlapping tokens between chunks
     * @return array<int, array{offset: int, length: int, data: string, estimated_tokens: int}>
     */
    public function chunkByTokens(string $text, int $maxTokens = self::MAX_TOKENS, int $overlapTokens = 80): array
    {
        // Convert the token limits into word limits.
        $maxWords = $this->maxWordsForTokenLimit($maxTokens);
        $overlapWords = (int) floor($overlapTokens / self::TOKEN_RATIO);

        // Build a new list instead of iterating by reference, which would
        // leave a dangling reference to the last element.
        $chunks = [];
        foreach ($this->chunk($text, $maxWords, $overlapWords) as $chunk) {
            $chunk['estimated_tokens'] = $this->estimateTokens($chunk['data']);
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Chunks a text by word count.
     *
     * The text is split on line breaks, then on sentence terminators. Sentences
     * are accumulated until adding one more would exceed $maxWords. A sentence
     * longer than $maxWords is cut into pieces of at most $maxWords words.
     * When a chunk is closed because the next regular sentence does not fit,
     * the new chunk starts with the tail of the previous one (the overlap),
     * limited so the new chunk itself stays within $maxWords.
     *
     * @param string $text
     * @param int $maxWords Maximum number of words per chunk (values below 1 are treated as 1)
     * @param int $overlapWords Number of overlapping words between chunks (0 or less disables overlap)
     * @return array<int, array{offset: int, length: int, data: string}>
     */
    public function chunk(string $text, int $maxWords = self::MAX_WORDS, int $overlapWords = self::OVERLAP_WORDS): array
    {
        // A limit below 1 can never be satisfied and would make the long
        // sentence splitter loop without consuming any word.
        $maxWords = max(1, $maxWords);

        $sentences = $this->extractSentences($text);
        if ($sentences === []) {
            return [];
        }

        $result = [];
        $currentChunk = '';
        $currentWordCount = 0;
        $chunkStartOffset = 0;
        // Position of the next piece in the normalised text (pieces joined by one space).
        $currentOffset = 0;

        foreach ($sentences as $sentence) {
            $sentenceWordCount = $this->countWords($sentence);

            // The sentence alone exceeds the limit: cut it into smaller pieces.
            if ($sentenceWordCount > $maxWords) {
                foreach ($this->splitLongText($sentence, $maxWords) as $subPart) {
                    $subPartWordCount = $this->countWords($subPart);

                    if ($currentChunk !== '' && $currentWordCount + $subPartWordCount > $maxWords) {
                        $result[] = $this->buildChunk($chunkStartOffset, $currentChunk);
                        $currentChunk = '';
                        $currentWordCount = 0;
                    }

                    if ($currentChunk === '') {
                        $currentChunk = $subPart;
                        $currentWordCount = $subPartWordCount;
                        $chunkStartOffset = $currentOffset;
                    } else {
                        $currentChunk .= ' ' . $subPart;
                        $currentWordCount += $subPartWordCount;
                    }

                    $currentOffset += strlen($subPart) + 1;
                }
                continue;
            }

            if ($currentChunk !== '' && $currentWordCount + $sentenceWordCount > $maxWords) {
                // Adding this sentence would exceed the limit: close the chunk.
                $result[] = $this->buildChunk($chunkStartOffset, $currentChunk);

                // Overlap: reuse the end of the chunk just closed, but never
                // more words than the new chunk can hold next to the sentence.
                $overlapText = '';
                $overlapBudget = min($overlapWords, $maxWords - $sentenceWordCount);
                if ($overlapBudget > 0 && $currentWordCount > $overlapWords) {
                    $overlapText = $this->getOverlapFromEnd($currentChunk, $overlapBudget, true);
                }

                if ($overlapText !== '') {
                    $currentChunk = $overlapText . ' ' . $sentence;
                    $currentWordCount = $this->countWords($overlapText) + $sentenceWordCount;
                    // The overlap ends one separator before the sentence starts.
                    $chunkStartOffset = $currentOffset - strlen($overlapText) - 1;
                } else {
                    $currentChunk = $sentence;
                    $currentWordCount = $sentenceWordCount;
                    $chunkStartOffset = $currentOffset;
                }
            } elseif ($currentChunk === '') {
                $currentChunk = $sentence;
                $currentWordCount = $sentenceWordCount;
                $chunkStartOffset = $currentOffset;
            } else {
                $currentChunk .= ' ' . $sentence;
                $currentWordCount += $sentenceWordCount;
            }

            $currentOffset += strlen($sentence) + 1;
        }

        // Last chunk.
        if ($currentChunk !== '') {
            $result[] = $this->buildChunk($chunkStartOffset, $currentChunk);
        }

        return $result;
    }

    /**
     * Estimates the number of tokens in a text, based on TOKEN_RATIO.
     *
     * @param string $text
     * @return int
     */
    public function estimateTokens(string $text): int
    {
        return (int) ceil($this->countWords($text) * self::TOKEN_RATIO);
    }

    /**
     * Computes the maximum number of words that fits in a token limit.
     *
     * @param int $maxTokens
     * @return int
     */
    public function maxWordsForTokenLimit(int $maxTokens = self::MAX_TOKENS): int
    {
        return (int) floor($maxTokens / self::TOKEN_RATIO);
    }

    /**
     * Builds one chunk entry.
     *
     * @param int $offset Start of the chunk in the normalised text
     * @param string $data Chunk content
     * @return array{offset: int, length: int, data: string}
     */
    private function buildChunk(int $offset, string $data): array
    {
        return [
            'offset' => $offset,
            'length' => strlen($data),
            'data' => $data
        ];
    }

    /**
     * Splits a text into trimmed, non-empty sentences.
     *
     * Lines are separated first, then each line is cut after its sentence
     * terminators. Text without terminator is kept as a sentence of its own.
     *
     * @param string $text
     * @return array<int, string>
     */
    private function extractSentences(string $text): array
    {
        $lines = preg_split('/\n+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $sentences = [];
        foreach ($lines as $line) {
            $line = trim($line);
            // Strict comparison: empty() would also discard a line equal to "0".
            if ($line === '') {
                continue;
            }

            if (!preg_match_all(self::SENTENCE_PATTERN, $line, $matches)) {
                // Matching failed: keep the whole line rather than losing it.
                $sentences[] = $line;
                continue;
            }

            foreach ($matches[0] as $fragment) {
                $fragment = trim($fragment);
                if ($fragment !== '') {
                    $sentences[] = $fragment;
                }
            }
        }

        return $sentences;
    }

    /**
     * Splits a text into words (runs of non-whitespace characters).
     *
     * @param string $text
     * @return array<int, string>
     */
    private function splitWords(string $text): array
    {
        return preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    }

    /**
     * Counts the words of a text.
     *
     * @param string $text
     * @return int
     */
    private function countWords(string $text): int
    {
        return count($this->splitWords($text));
    }

    /**
     * Cuts a text that is too long into pieces of at most $maxWords words.
     * Whitespace between words is normalised to a single space.
     *
     * @param string $text
     * @param int $maxWords
     * @return array<string>
     */
    private function splitLongText(string $text, int $maxWords): array
    {
        $words = $this->splitWords($text);
        if ($words === []) {
            return [];
        }

        $parts = [];
        // array_chunk() requires a size of at least 1.
        foreach (array_chunk($words, max(1, $maxWords)) as $group) {
            $parts[] = implode(' ', $group);
        }

        return $parts;
    }

    /**
     * Returns the last $count words of a text as an exact suffix of it, so the
     * original spacing between those words is preserved.
     *
     * @param string $text
     * @param int $count
     * @return string
     */
    private function lastWords(string $text, int $count): string
    {
        if ($count <= 0 || !preg_match_all('/\S+/', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return '';
        }

        $words = $matches[0];
        $first = max(0, count($words) - $count);

        return rtrim(substr($text, $words[$first][1]));
    }

    /**
     * Extracts the overlap text from the end of a chunk.
     *
     * Whole sentences are taken from the end while they fit in $targetLength.
     * When not even the last sentence fits, the last $targetLength words (or
     * bytes) are returned instead. The result is always a suffix of the
     * trimmed text.
     *
     * @param string $text
     * @param int $targetLength Target size (in bytes or in words depending on $isWordCount)
     * @param bool $isWordCount If true the size is counted in words, otherwise in bytes
     * @return string
     */
    private function getOverlapFromEnd(string $text, int $targetLength, bool $isWordCount = false): string
    {
        $text = trim($text);
        // A negative-zero slice (-0) would return the whole text, so stop early.
        if ($text === '' || $targetLength <= 0) {
            return '';
        }

        $overlap = '';
        if (preg_match_all(self::SENTENCE_PATTERN, $text, $matches, PREG_OFFSET_CAPTURE)) {
            // Walk the sentence starts backwards and keep the longest suffix that fits.
            for ($i = count($matches[0]) - 1; $i >= 0; $i--) {
                $candidate = ltrim(substr($text, $matches[0][$i][1]));
                $size = $isWordCount ? $this->countWords($candidate) : strlen($candidate);
                if ($size > $targetLength) {
                    break;
                }
                $overlap = $candidate;
            }
        }

        if ($overlap !== '') {
            return $overlap;
        }

        // No whole sentence fits: fall back to a raw cut.
        if ($isWordCount) {
            return $this->lastWords($text, $targetLength);
        }

        return substr($text, -$targetLength);
    }
}
