diff --git a/examples/store/document-splitting.php b/examples/store/document-splitting.php new file mode 100644 index 00000000..43029695 --- /dev/null +++ b/examples/store/document-splitting.php @@ -0,0 +1,16 @@ +loadEnv(dirname(__DIR__, 2).'/.env'); + +if (empty($_ENV['OPENAI_API_KEY'])) { + echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL; + exit(1); +} + +$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); +$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE); + +$textDocuments = [ + new TextDocument(Uuid::v4(), 'Hello World'), + new TextDocument(Uuid::v4(), 'Lorem ipsum dolor sit amet'), + new TextDocument(Uuid::v4(), 'PHP Hypertext Preprocessor'), +]; + +$vectorizer = new Vectorizer($platform, $embeddings); +$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments); + +dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments)); diff --git a/examples/store/mariadb-similarity-search-gemini.php b/examples/store/mariadb-similarity-search-gemini.php index 3c29374f..25df7003 100644 --- a/examples/store/mariadb-similarity-search-gemini.php +++ b/examples/store/mariadb-similarity-search-gemini.php @@ -15,6 +15,7 @@ use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store; use PhpLlm\LlmChain\Store\Document\Metadata; use PhpLlm\LlmChain\Store\Document\TextDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use PhpLlm\LlmChain\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -57,7 +58,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']); $embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]); -$indexer = new Indexer($platform, $embeddings, $store); +$vectorizer = new Vectorizer($platform, $embeddings); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new Gemini(Gemini::GEMINI_2_FLASH_LITE); diff --git a/examples/store/mariadb-similarity-search.php b/examples/store/mariadb-similarity-search.php index 65332d59..b7e2cc93 100644 --- a/examples/store/mariadb-similarity-search.php +++ b/examples/store/mariadb-similarity-search.php @@ -14,6 +14,7 @@ use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store; use PhpLlm\LlmChain\Store\Document\Metadata; use PhpLlm\LlmChain\Store\Document\TextDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use PhpLlm\LlmChain\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -55,7 +56,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new GPT(GPT::GPT_4O_MINI); diff --git a/examples/store/mongodb-similarity-search.php b/examples/store/mongodb-similarity-search.php index c04cc30b..ed009e5e 100644 --- a/examples/store/mongodb-similarity-search.php +++ b/examples/store/mongodb-similarity-search.php @@ -13,6 +13,7 @@ use PhpLlm\LlmChain\Store\Bridge\MongoDB\Store; use PhpLlm\LlmChain\Store\Document\Metadata; use PhpLlm\LlmChain\Store\Document\TextDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use PhpLlm\LlmChain\Store\Indexer; use Symfony\Component\Dotenv\Dotenv; use Symfony\Component\Uid\Uuid; @@ -52,7 +53,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); // initialize the index diff --git a/examples/store/pinecone-similarity-search.php b/examples/store/pinecone-similarity-search.php index c9083da9..ad6b0c9c 100644 --- a/examples/store/pinecone-similarity-search.php +++ b/examples/store/pinecone-similarity-search.php @@ -12,6 +12,7 @@ use PhpLlm\LlmChain\Store\Bridge\Pinecone\Store; use PhpLlm\LlmChain\Store\Document\Metadata; use PhpLlm\LlmChain\Store\Document\TextDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use PhpLlm\LlmChain\Store\Indexer; use Probots\Pinecone\Pinecone; use Symfony\Component\Dotenv\Dotenv; @@ -46,7 +47,8 @@ // create embeddings for documents $platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); -$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store); +$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings()); +$indexer = new Indexer($vectorizer, $store); $indexer->index($documents); $model = new GPT(GPT::GPT_4O_MINI); diff --git a/src/Store/Document/Loader/TextFileLoader.php b/src/Store/Document/Loader/TextFileLoader.php new file mode 100644 index 00000000..af703f4d --- /dev/null +++ b/src/Store/Document/Loader/TextFileLoader.php @@ -0,0 +1,34 @@ + + */ +final readonly class TextFileLoader implements LoaderInterface +{ + public function __invoke(string $source, array $options = []): iterable + { + if (!is_file($source)) { + throw new RuntimeException(\sprintf('File "%s" does not exist.', $source)); + } + + $content = file_get_contents($source); + + if (false === $content) { + throw new RuntimeException(\sprintf('Unable to read file "%s"', $source)); + } + + yield new TextDocument(Uuid::v4(), trim($content), new Metadata([ + 'source' => $source, + ])); + } +} diff --git a/src/Store/Document/LoaderInterface.php b/src/Store/Document/LoaderInterface.php new file mode 100644 index 00000000..2e653785 --- /dev/null +++ b/src/Store/Document/LoaderInterface.php @@ -0,0 +1,19 @@ + + */ +interface LoaderInterface +{ + /** + * @param string $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL. + * @param array $options loader specific set of options to control the loading process + * + * @return iterable iterable of TextDocuments loaded from the source + */ + public function __invoke(string $source, array $options = []): iterable; +} diff --git a/src/Store/Document/Transformer/ChainTransformer.php b/src/Store/Document/Transformer/ChainTransformer.php new file mode 100644 index 00000000..16ede042 --- /dev/null +++ b/src/Store/Document/Transformer/ChainTransformer.php @@ -0,0 +1,32 @@ + $transformers + */ + public function __construct(iterable $transformers) + { + $this->transformers = $transformers instanceof \Traversable ? iterator_to_array($transformers) : $transformers; + } + + public function __invoke(iterable $documents, array $options = []): iterable + { + foreach ($this->transformers as $transformer) { + $documents = $transformer($documents, $options); + } + + return $documents; + } +} diff --git a/src/Store/Document/Transformer/ChunkDelayTransformer.php b/src/Store/Document/Transformer/ChunkDelayTransformer.php new file mode 100644 index 00000000..2948379d --- /dev/null +++ b/src/Store/Document/Transformer/ChunkDelayTransformer.php @@ -0,0 +1,44 @@ + + */ +final readonly class ChunkDelayTransformer implements TransformerInterface +{ + public const OPTION_CHUNK_SIZE = 'chunk_size'; + public const OPTION_DELAY = 'delay'; + + public function __construct( + private ClockInterface $clock, + ) { + } + + /** + * @param array{chunk_size?: int, delay?: int} $options + */ + public function __invoke(iterable $documents, array $options = []): iterable + { + $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50; + $delay = $options[self::OPTION_DELAY] ?? 10; + + $counter = 0; + foreach ($documents as $document) { + yield $document; + ++$counter; + + if ($chunkSize === $counter && 0 !== $delay) { + $this->clock->sleep($delay); + } + } + } +} diff --git a/src/Store/Document/Transformer/TextSplitTransformer.php b/src/Store/Document/Transformer/TextSplitTransformer.php new file mode 100644 index 00000000..60c2d624 --- /dev/null +++ b/src/Store/Document/Transformer/TextSplitTransformer.php @@ -0,0 +1,62 @@ + + */ +final readonly class TextSplitTransformer implements TransformerInterface +{ + public const OPTION_CHUNK_SIZE = 'chunk_size'; + public const OPTION_OVERLAP = 'overlap'; + + /** + * @param array{chunk_size?: int, overlap?: int} $options + */ + public function __invoke(iterable $documents, array $options = []): iterable + { + $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000; + $overlap = $options[self::OPTION_OVERLAP] ?? 200; + + if ($overlap < 0 || $overlap >= $chunkSize) { + throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.'); + } + + foreach ($documents as $document) { + if (mb_strlen($document->content) <= $chunkSize) { + yield $document; + + continue; + } + + $text = $document->content; + $length = mb_strlen($text); + $start = 0; + + while ($start < $length) { + $end = min($start + $chunkSize, $length); + $chunkText = mb_substr($text, $start, $end - $start); + + yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([ + 'parent_id' => $document->id, + 'text' => $chunkText, + ...$document->metadata, + ])); + + $start += ($chunkSize - $overlap); + } + } + } +} diff --git a/src/Store/Document/TransformerInterface.php b/src/Store/Document/TransformerInterface.php new file mode 100644 index 00000000..76d8c965 --- /dev/null +++ b/src/Store/Document/TransformerInterface.php @@ -0,0 +1,23 @@ + + */ +interface TransformerInterface +{ + /** + * @param iterable $documents + * @param array $options + * + * @return iterable + */ + public function __invoke(iterable $documents, array $options = []): iterable; +} diff --git a/src/Store/Document/Vectorizer.php b/src/Store/Document/Vectorizer.php new file mode 100644 index 00000000..e9f51b2f --- /dev/null +++ b/src/Store/Document/Vectorizer.php @@ -0,0 +1,53 @@ +model->supports(Capability::INPUT_MULTIPLE)) { + $response = $this->platform->request($this->model, array_map(fn (TextDocument $document) => $document->content, $documents)); + + $vectors = $response->getContent(); + } else { + $responses = []; + foreach ($documents as $document) { + $responses[] = $this->platform->request($this->model, $document->content); + } + + $vectors = []; + foreach ($responses as $response) { + $vectors = array_merge($vectors, $response->getContent()); + } + } + + $vectorDocuments = []; + foreach ($documents as $i => $document) { + $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata); + } + + return $vectorDocuments; + } +} diff --git a/src/Store/Indexer.php b/src/Store/Indexer.php index 8057c200..0f4f1bf3 100644 --- a/src/Store/Indexer.php +++ b/src/Store/Indexer.php @@ -4,87 +4,51 @@ namespace PhpLlm\LlmChain\Store; -use PhpLlm\LlmChain\Platform\Capability; -use PhpLlm\LlmChain\Platform\Model; -use PhpLlm\LlmChain\Platform\PlatformInterface; use PhpLlm\LlmChain\Store\Document\TextDocument; -use PhpLlm\LlmChain\Store\Document\VectorDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; -use Symfony\Component\Clock\Clock; -use Symfony\Component\Clock\ClockInterface; /** + * Converts a collection of TextDocuments into VectorDocuments and pushes them to a store implementation. + * * @author Christopher Hertel */ final readonly class Indexer { - private ClockInterface $clock; - public function __construct( - private PlatformInterface $platform, - private Model $model, + private Vectorizer $vectorizer, private StoreInterface $store, - ?ClockInterface $clock = null, private LoggerInterface $logger = new NullLogger(), ) { - $this->clock = $clock ?? Clock::get(); } /** * @param TextDocument|iterable $documents + * @param int $chunkSize number of documents to vectorize and store in one batch */ - public function index(TextDocument|iterable $documents, int $chunkSize = 0, int $sleep = 0): void + public function index(TextDocument|iterable $documents, int $chunkSize = 50): void { if ($documents instanceof TextDocument) { $documents = [$documents]; } - if ([] === $documents) { - $this->logger->debug('No documents to index'); - - return; - } - - $chunks = 0 !== $chunkSize ? array_chunk($documents, $chunkSize) : [$documents]; - - foreach ($chunks as $chunk) { - $this->store->add(...$this->createVectorDocuments($chunk)); - - if (0 !== $sleep) { - $this->clock->sleep($sleep); - } - } - } - - /** - * @param TextDocument[] $documents - * - * @return VectorDocument[] - */ - private function createVectorDocuments(array $documents): array - { - if ($this->model->supports(Capability::INPUT_MULTIPLE)) { - $response = $this->platform->request($this->model, array_map(fn (TextDocument $document) => $document->content, $documents)); - - $vectors = $response->getContent(); - } else { - $responses = []; - foreach ($documents as $document) { - $responses[] = $this->platform->request($this->model, $document->content); - } + $counter = 0; + $chunk = []; + foreach ($documents as $document) { + $chunk[] = $document; + ++$counter; - $vectors = []; - foreach ($responses as $response) { - $vectors = array_merge($vectors, $response->getContent()); + if ($chunkSize === \count($chunk)) { + $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk)); + $chunk = []; } } - $vectorDocuments = []; - foreach ($documents as $i => $document) { - $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata); + if (\count($chunk) > 0) { + $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk)); } - return $vectorDocuments; + $this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter)); } } diff --git a/tests/Fixture/lorem.txt b/tests/Fixture/lorem.txt new file mode 100644 index 00000000..a2600b83 --- /dev/null +++ b/tests/Fixture/lorem.txt @@ -0,0 +1,15 @@ +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. +Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, +ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, +fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, +justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper +nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim. +Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius +laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies +nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero, +sit amet adipiscing sem neque sed ipsum. Nam quam nunc, blandit vel, luctus pulvinar, hendrerit id, lorem. +Maecenas nec odio et ante tincidunt tempus. Donec vitae sapien ut libero venenatis faucibus. Nullam quis +ante. Etiam sit amet orci eget eros faucibus tincidunt. Duis leo. Sed fringilla mauris sit amet nibh. Donec +sodales sagittis magna. Sed consequat, leo eget bibendum sodales, augue velit cursus nunc, quis gravida +magna mi a libero. Fusce vulputate eleifend sapien. Vestibulum purus quam, scelerisque ut, mollis sed, +nonummy id, met diff --git a/tests/Store/Document/Loader/TextFileLoaderTest.php b/tests/Store/Document/Loader/TextFileLoaderTest.php new file mode 100644 index 00000000..c6224831 --- /dev/null +++ b/tests/Store/Document/Loader/TextFileLoaderTest.php @@ -0,0 +1,54 @@ +content); + self::assertStringEndsWith('nonummy id, met', $document->content); + self::assertSame(1500, \strlen($document->content)); + } + + #[Test] + public function sourceIsPresentInMetadata(): void + { + $loader = new TextFileLoader(); + + $source = \dirname(__DIR__, 3).'/Fixture/lorem.txt'; + $documents = iterator_to_array($loader($source)); + + self::assertCount(1, $documents); + self::assertInstanceOf(TextDocument::class, $document = $documents[0]); + self::assertSame($source, $document->metadata['source']); + } +} diff --git a/tests/Store/Document/Transformer/ChainTransformerTest.php b/tests/Store/Document/Transformer/ChainTransformerTest.php new file mode 100644 index 00000000..58887484 --- /dev/null +++ b/tests/Store/Document/Transformer/ChainTransformerTest.php @@ -0,0 +1,60 @@ +id, $document->content.'-A'); + } + } + }; + + $transformerB = new class implements TransformerInterface { + public function __invoke(iterable $documents, array $options = []): iterable + { + foreach ($documents as $document) { + yield new TextDocument($document->id, $document->content.'-B'); + } + } + }; + + $chain = new ChainTransformer([$transformerA, $transformerB]); + $documents = [ + new TextDocument(Uuid::v4(), 'foo'), + new TextDocument(Uuid::v4(), 'bar'), + ]; + + $result = iterator_to_array($chain->__invoke($documents)); + + self::assertSame('foo-A-B', $result[0]->content); + self::assertSame('bar-A-B', $result[1]->content); + } + + public function testChainTransformerWithNoTransformersReturnsInput(): void + { + $chain = new ChainTransformer([]); + $documents = [new TextDocument(Uuid::v4(), 'baz')]; + + $result = iterator_to_array($chain->__invoke($documents)); + + self::assertSame('baz', $result[0]->content); + } +} diff --git a/tests/Store/Document/Transformer/TextSplitTransformerTest.php b/tests/Store/Document/Transformer/TextSplitTransformerTest.php new file mode 100644 index 00000000..52fe46b5 --- /dev/null +++ b/tests/Store/Document/Transformer/TextSplitTransformerTest.php @@ -0,0 +1,195 @@ +transformer = new TextSplitTransformer(); + } + + #[Test] + public function splitReturnsSingleChunkForShortText(): void + { + $document = new TextDocument(Uuid::v4(), 'short text'); + + $chunks = iterator_to_array(($this->transformer)([$document])); + + self::assertCount(1, $chunks); + self::assertSame('short text', $chunks[0]->content); + } + + #[Test] + public function textLength(): void + { + self::assertSame(1500, mb_strlen($this->getLongText())); + } + + #[Test] + public function splitSplitsLongTextWithOverlap(): void + { + $document = new TextDocument(Uuid::v4(), $this->getLongText()); + + $chunks = iterator_to_array(($this->transformer)([$document])); + + self::assertCount(2, $chunks); + + self::assertSame(1000, mb_strlen($chunks[0]->content)); + self::assertSame(substr($this->getLongText(), 0, 1000), $chunks[0]->content); + + self::assertSame(700, mb_strlen($chunks[1]->content)); + self::assertSame(substr($this->getLongText(), 800, 700), $chunks[1]->content); + } + + #[Test] + public function splitWithCustomChunkSizeAndOverlap(): void + { + $document = new TextDocument(Uuid::v4(), $this->getLongText()); + + $chunks = iterator_to_array(($this->transformer)([$document], [ + TextSplitTransformer::OPTION_CHUNK_SIZE => 150, + TextSplitTransformer::OPTION_OVERLAP => 25, + ])); + + self::assertCount(12, $chunks); + + self::assertSame(150, mb_strlen($chunks[0]->content)); + self::assertSame(substr($this->getLongText(), 0, 150), $chunks[0]->content); + + self::assertSame(150, mb_strlen($chunks[1]->content)); + self::assertSame(substr($this->getLongText(), 125, 150), $chunks[1]->content); + + self::assertSame(150, mb_strlen($chunks[2]->content)); + self::assertSame(substr($this->getLongText(), 250, 150), $chunks[2]->content); + + self::assertSame(150, mb_strlen($chunks[3]->content)); + self::assertSame(substr($this->getLongText(), 375, 150), $chunks[3]->content); + + self::assertSame(150, mb_strlen($chunks[4]->content)); + self::assertSame(substr($this->getLongText(), 500, 150), $chunks[4]->content); + + self::assertSame(150, mb_strlen($chunks[5]->content)); + self::assertSame(substr($this->getLongText(), 625, 150), $chunks[5]->content); + + self::assertSame(150, mb_strlen($chunks[6]->content)); + self::assertSame(substr($this->getLongText(), 750, 150), $chunks[6]->content); + + self::assertSame(150, mb_strlen($chunks[7]->content)); + self::assertSame(substr($this->getLongText(), 875, 150), $chunks[7]->content); + + self::assertSame(150, mb_strlen($chunks[8]->content)); + self::assertSame(substr($this->getLongText(), 1000, 150), $chunks[8]->content); + + self::assertSame(150, mb_strlen($chunks[9]->content)); + self::assertSame(substr($this->getLongText(), 1125, 150), $chunks[9]->content); + + self::assertSame(150, mb_strlen($chunks[10]->content)); + self::assertSame(substr($this->getLongText(), 1250, 150), $chunks[10]->content); + + self::assertSame(125, mb_strlen($chunks[11]->content)); + self::assertSame(substr($this->getLongText(), 1375, 150), $chunks[11]->content); + } + + #[Test] + public function splitWithZeroOverlap(): void + { + $document = new TextDocument(Uuid::v4(), $this->getLongText()); + + $chunks = iterator_to_array(($this->transformer)([$document], [ + TextSplitTransformer::OPTION_OVERLAP => 0, + ])); + + self::assertCount(2, $chunks); + self::assertSame(substr($this->getLongText(), 0, 1000), $chunks[0]->content); + self::assertSame(substr($this->getLongText(), 1000, 500), $chunks[1]->content); + } + + #[Test] + public function parentIdIsSetInMetadata(): void + { + $document = new TextDocument(Uuid::v4(), $this->getLongText()); + + $chunks = iterator_to_array(($this->transformer)([$document], [ + TextSplitTransformer::OPTION_CHUNK_SIZE => 1000, + TextSplitTransformer::OPTION_OVERLAP => 200, + ])); + + self::assertCount(2, $chunks); + self::assertSame($document->id, $chunks[0]->metadata['parent_id']); + self::assertSame($document->id, $chunks[1]->metadata['parent_id']); + } + + #[Test] + public function metadataIsInherited(): void + { + $document = new TextDocument(Uuid::v4(), $this->getLongText(), new Metadata([ + 'key' => 'value', + 'foo' => 'bar', + ])); + + $chunks = iterator_to_array(($this->transformer)([$document])); + + self::assertCount(2, $chunks); + self::assertSame('value', $chunks[0]->metadata['key']); + self::assertSame('bar', $chunks[0]->metadata['foo']); + self::assertSame('value', $chunks[1]->metadata['key']); + self::assertSame('bar', $chunks[1]->metadata['foo']); + } + + #[Test] + public function splitWithChunkSizeLargerThanText(): void + { + $document = new TextDocument(Uuid::v4(), 'tiny'); + + $chunks = iterator_to_array(($this->transformer)([$document])); + + self::assertCount(1, $chunks); + self::assertSame('tiny', $chunks[0]->content); + } + + #[Test] + public function splitWithOverlapGreaterThanChunkSize(): void + { + $document = new TextDocument(Uuid::v4(), 'Abcdefg', new Metadata([])); + self::expectException(InvalidArgumentException::class); + self::expectExceptionMessage('Overlap must be non-negative and less than chunk size.'); + + iterator_to_array(($this->transformer)([$document], [ + TextSplitTransformer::OPTION_CHUNK_SIZE => 10, + TextSplitTransformer::OPTION_OVERLAP => 20, + ])); + } + + #[Test] + public function splitWithNegativeOverlap(): void + { + $document = new TextDocument(Uuid::v4(), 'Abcdefg', new Metadata([])); + self::expectException(InvalidArgumentException::class); + self::expectExceptionMessage('Overlap must be non-negative and less than chunk size.'); + + iterator_to_array(($this->transformer)([$document], [ + TextSplitTransformer::OPTION_CHUNK_SIZE => 10, + TextSplitTransformer::OPTION_OVERLAP => -1, + ])); + } + + private function getLongText(): string + { + return trim(file_get_contents(\dirname(__DIR__, 3).'/Fixture/lorem.txt')); + } +} diff --git a/tests/Store/IndexerTest.php b/tests/Store/IndexerTest.php index 83e1a060..c267c06b 100644 --- a/tests/Store/IndexerTest.php +++ b/tests/Store/IndexerTest.php @@ -14,6 +14,7 @@ use PhpLlm\LlmChain\Store\Document\Metadata; use PhpLlm\LlmChain\Store\Document\TextDocument; use PhpLlm\LlmChain\Store\Document\VectorDocument; +use PhpLlm\LlmChain\Store\Document\Vectorizer; use PhpLlm\LlmChain\Store\Indexer; use PhpLlm\LlmChain\Tests\Double\PlatformTestHandler; use PhpLlm\LlmChain\Tests\Double\TestStore; @@ -23,7 +24,6 @@ use PHPUnit\Framework\Attributes\UsesClass; use PHPUnit\Framework\TestCase; use Psr\Log\LoggerInterface; -use Symfony\Component\Clock\MockClock; use Symfony\Component\Uid\Uuid; #[CoversClass(Indexer::class)] @@ -40,18 +40,13 @@ final class IndexerTest extends TestCase { #[Test] - public function embedSingleDocument(): void + public function indexSingleDocument(): void { $document = new TextDocument($id = Uuid::v4(), 'Test content'); $vector = new Vector([0.1, 0.2, 0.3]); + $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResponse($vector)), new Embeddings()); - $indexer = new Indexer( - PlatformTestHandler::createPlatform(new VectorResponse($vector)), - new Embeddings(), - $store = new TestStore(), - new MockClock(), - ); - + $indexer = new Indexer($vectorizer, $store = new TestStore()); $indexer->index($document); self::assertCount(1, $store->documents); @@ -61,38 +56,27 @@ public function embedSingleDocument(): void } #[Test] - public function embedEmptyDocumentList(): void + public function indexEmptyDocumentList(): void { $logger = self::createMock(LoggerInterface::class); $logger->expects(self::once())->method('debug')->with('No documents to index'); + $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(), new Embeddings()); - $indexer = new Indexer( - PlatformTestHandler::createPlatform(), - new Embeddings(), - $store = new TestStore(), - new MockClock(), - $logger, - ); - + $indexer = new Indexer($vectorizer, $store = new TestStore(), $logger); $indexer->index([]); self::assertSame([], $store->documents); } #[Test] - public function embedDocumentWithMetadata(): void + public function indexDocumentWithMetadata(): void { $metadata = new Metadata(['key' => 'value']); $document = new TextDocument($id = Uuid::v4(), 'Test content', $metadata); $vector = new Vector([0.1, 0.2, 0.3]); + $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResponse($vector)), new Embeddings()); - $indexer = new Indexer( - PlatformTestHandler::createPlatform(new VectorResponse($vector)), - new Embeddings(), - $store = new TestStore(), - new MockClock(), - ); - + $indexer = new Indexer($vectorizer, $store = new TestStore()); $indexer->index($document); self::assertSame(1, $store->addCalls); @@ -102,30 +86,4 @@ public function embedDocumentWithMetadata(): void self::assertSame($vector, $store->documents[0]->vector); self::assertSame(['key' => 'value'], $store->documents[0]->metadata->getArrayCopy()); } - - #[Test] - public function embedWithSleep(): void - { - $vector1 = new Vector([0.1, 0.2, 0.3]); - $vector2 = new Vector([0.4, 0.5, 0.6]); - - $document1 = new TextDocument(Uuid::v4(), 'Test content 1'); - $document2 = new TextDocument(Uuid::v4(), 'Test content 2'); - - $indexer = new Indexer( - PlatformTestHandler::createPlatform(new VectorResponse($vector1, $vector2)), - new Embeddings(), - $store = new TestStore(), - $clock = new MockClock('2024-01-01 00:00:00'), - ); - - $indexer->index( - documents: [$document1, $document2], - sleep: 3 - ); - - self::assertSame(1, $store->addCalls); - self::assertCount(2, $store->documents); - self::assertSame('2024-01-01 00:00:03', $clock->now()->format('Y-m-d H:i:s')); - } }