Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

feat: implement document loader & transformer for store indexing #343

Merged
merged 1 commit into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/store/document-splitting.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

declare(strict_types=1);

use PhpLlm\LlmChain\Store\Document\Loader\TextFileLoader;
use PhpLlm\LlmChain\Store\Document\Transformer\TextSplitTransformer;

$projectDir = dirname(__DIR__, 2);

require_once $projectDir.'/vendor/autoload.php';

// Path of the plain-text fixture that is loaded and split below.
$source = $projectDir.'/tests/Fixture/lorem.txt';

$loader = new TextFileLoader();
$splitter = new TextSplitTransformer();

// Load the file as TextDocument(s), then pipe them through the splitter with its default options.
$loaded = $loader($source);
$chunks = $splitter($loaded);

// Both steps return iterables; materialize them so the result can be dumped as a whole.
$documents = iterator_to_array($chunks);

dump($documents);
33 changes: 33 additions & 0 deletions examples/store/document-vectorizing.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?php

declare(strict_types=1);

use PhpLlm\LlmChain\Platform\Bridge\OpenAI\Embeddings;
use PhpLlm\LlmChain\Platform\Bridge\OpenAI\PlatformFactory;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\VectorDocument;
use PhpLlm\LlmChain\Store\Document\Vectorizer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;

$projectDir = dirname(__DIR__, 2);

require_once $projectDir.'/vendor/autoload.php';
(new Dotenv())->loadEnv($projectDir.'/.env');

// The example cannot run without an API key, so bail out early with a hint.
$apiKey = $_ENV['OPENAI_API_KEY'] ?? null;
if (empty($apiKey)) {
    echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL;
    exit(1);
}

$platform = PlatformFactory::create($apiKey);
$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE);

// Wrap each sample text in a TextDocument with a freshly generated UUID.
$textDocuments = array_map(
    static fn (string $text) => new TextDocument(Uuid::v4(), $text),
    [
        'Hello World',
        'Lorem ipsum dolor sit amet',
        'PHP Hypertext Preprocessor',
    ],
);

$vectorizer = new Vectorizer($platform, $embeddings);
$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments);

// Only the dimension count of every resulting vector is printed.
$dimensions = [];
foreach ($vectorDocuments as $index => $vectorDocument) {
    $dimensions[$index] = $vectorDocument->vector->getDimensions();
}

dump($dimensions);
4 changes: 3 additions & 1 deletion examples/store/mariadb-similarity-search-gemini.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\Vectorizer;
use PhpLlm\LlmChain\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -57,7 +58,8 @@
// create embeddings for documents
$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']);
$embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]);
$indexer = new Indexer($platform, $embeddings, $store);
$vectorizer = new Vectorizer($platform, $embeddings);
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new Gemini(Gemini::GEMINI_2_FLASH_LITE);
Expand Down
4 changes: 3 additions & 1 deletion examples/store/mariadb-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\Vectorizer;
use PhpLlm\LlmChain\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -55,7 +56,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new GPT(GPT::GPT_4O_MINI);
Expand Down
4 changes: 3 additions & 1 deletion examples/store/mongodb-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
use PhpLlm\LlmChain\Store\Bridge\MongoDB\Store;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\Vectorizer;
use PhpLlm\LlmChain\Store\Indexer;
use Symfony\Component\Dotenv\Dotenv;
use Symfony\Component\Uid\Uuid;
Expand Down Expand Up @@ -52,7 +53,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

// initialize the index
Expand Down
4 changes: 3 additions & 1 deletion examples/store/pinecone-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
use PhpLlm\LlmChain\Store\Bridge\Pinecone\Store;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\Vectorizer;
use PhpLlm\LlmChain\Store\Indexer;
use Probots\Pinecone\Pinecone;
use Symfony\Component\Dotenv\Dotenv;
Expand Down Expand Up @@ -46,7 +47,8 @@

// create embeddings for documents
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
$indexer = new Indexer($vectorizer, $store);
$indexer->index($documents);

$model = new GPT(GPT::GPT_4O_MINI);
Expand Down
34 changes: 34 additions & 0 deletions src/Store/Document/Loader/TextFileLoader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document\Loader;

use PhpLlm\LlmChain\Store\Document\LoaderInterface;
use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Exception\RuntimeException;
use Symfony\Component\Uid\Uuid;

/**
 * Loader that reads a single file from the local filesystem and exposes its trimmed content as one TextDocument.
 * The original path is recorded in the document's metadata under the "source" key.
 *
 * @author Christopher Hertel <[email protected]>
 */
final readonly class TextFileLoader implements LoaderInterface
{
    /**
     * @param string               $source  path of the file to read
     * @param array<string, mixed> $options not used by this loader
     *
     * @return iterable<TextDocument> yields exactly one document holding the file content
     *
     * @throws RuntimeException when the path is not a regular file or its content cannot be read
     */
    public function __invoke(string $source, array $options = []): iterable
    {
        $raw = is_file($source)
            ? file_get_contents($source)
            : throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));

        if (false === $raw) {
            throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
        }

        $metadata = new Metadata(['source' => $source]);

        // Each invocation produces a document with a new random identifier.
        yield new TextDocument(Uuid::v4(), trim($raw), $metadata);
    }
}
19 changes: 19 additions & 0 deletions src/Store/Document/LoaderInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document;

/**
* A Loader resolves a source identifier into TextDocuments, which can then be handed to transformers
* or other consumers of TextDocuments.
*
* @author Christopher Hertel <[email protected]>
*/
interface LoaderInterface
{
/**
* Loads the documents found at the given source.
*
* @param string $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL.
* @param array<string, mixed> $options Loader-specific set of options to control the loading process.
*
* @return iterable<TextDocument> Iterable of TextDocuments loaded from the source; implementations may be
*                                generators, so callers should not assume the result is an array.
*/
public function __invoke(string $source, array $options = []): iterable;
}
32 changes: 32 additions & 0 deletions src/Store/Document/Transformer/ChainTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document\Transformer;

use PhpLlm\LlmChain\Store\Document\TransformerInterface;

/**
 * Applies a sequence of transformers one after another: the output of each transformer becomes the input of the
 * next one, in the order the transformers were given.
 */
final readonly class ChainTransformer implements TransformerInterface
{
    /**
     * @var TransformerInterface[]
     */
    private array $transformers;

    /**
     * @param iterable<TransformerInterface> $transformers transformers in the order they should be applied
     */
    public function __construct(iterable $transformers)
    {
        // Keys carry no meaning for the chain. Dropping them keeps every transformer even when the iterator
        // emits duplicate keys (e.g. a generator using "yield from"), which would otherwise overwrite entries.
        $this->transformers = $transformers instanceof \Traversable
            ? iterator_to_array($transformers, false)
            : $transformers;
    }

    /**
     * @param iterable<TextDocument> $documents documents to send through the chain
     * @param array<string, mixed>   $options   passed unchanged to every transformer of the chain
     *
     * @return iterable<TextDocument> result of the last transformer, or the input itself for an empty chain
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        foreach ($this->transformers as $transformer) {
            $documents = $transformer($documents, $options);
        }

        return $documents;
    }
}
44 changes: 44 additions & 0 deletions src/Store/Document/Transformer/ChunkDelayTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document\Transformer;

use PhpLlm\LlmChain\Store\Document\TransformerInterface;
use Symfony\Component\Clock\ClockInterface;

/**
 * This transformer splits the batch of documents into chunks and delays in-between with x seconds, which is useful
 * when indexing a lot of documents and facing API rate limits.
 *
 * Documents are passed through unchanged; only the pace at which they are handed out is affected.
 *
 * @author Christopher Hertel <[email protected]>
 */
final readonly class ChunkDelayTransformer implements TransformerInterface
{
    public const OPTION_CHUNK_SIZE = 'chunk_size';
    public const OPTION_DELAY = 'delay';

    public function __construct(
        private ClockInterface $clock,
    ) {
    }

    /**
     * @param iterable<TextDocument>               $documents documents to pass through
     * @param array{chunk_size?: int, delay?: int} $options   chunk_size: number of documents per chunk (default 50),
     *                                                        delay: pause after each full chunk (default 10),
     *                                                        a delay of 0 disables pausing
     *
     * @return iterable<TextDocument> the incoming documents, unchanged and in the same order
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50;
        $delay = $options[self::OPTION_DELAY] ?? 10;

        // A non-positive chunk size can never be "filled", so it disables pausing (and avoids a modulo by zero).
        $shouldDelay = $chunkSize > 0 && 0 !== $delay;

        $counter = 0;
        foreach ($documents as $document) {
            yield $document;
            ++$counter;

            // Pause after every completed chunk, not only after the first one. As this runs after the yield,
            // the pause takes effect when the consumer asks for the next document.
            if ($shouldDelay && 0 === $counter % $chunkSize) {
                $this->clock->sleep($delay);
            }
        }
    }
}
62 changes: 62 additions & 0 deletions src/Store/Document/Transformer/TextSplitTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document\Transformer;

use PhpLlm\LlmChain\Store\Document\Metadata;
use PhpLlm\LlmChain\Store\Document\TextDocument;
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
use PhpLlm\LlmChain\Store\Exception\InvalidArgumentException;
use Symfony\Component\Uid\Uuid;

/**
 * Splits a TextDocument into smaller chunks of specified size with optional overlap.
 * If the document's content is not longer than the specified chunk size, the original document is passed through
 * unchanged. Overlap cannot be negative and must be less than the chunk size.
 *
 * Sizes are measured in characters (mb_strlen/mb_substr), not bytes.
 *
 * @author Christopher Hertel <[email protected]>
 */
final readonly class TextSplitTransformer implements TransformerInterface
{
    public const OPTION_CHUNK_SIZE = 'chunk_size';
    public const OPTION_OVERLAP = 'overlap';

    /**
     * @param iterable<TextDocument>                 $documents documents to split
     * @param array{chunk_size?: int, overlap?: int} $options   chunk_size: maximum characters per chunk (default 1000),
     *                                                          overlap: characters shared by consecutive chunks
     *                                                          (default 200)
     *
     * @return iterable<TextDocument> untouched short documents and, for longer ones, new chunk documents that
     *                                carry the parent's metadata plus "parent_id" and "text"
     *
     * @throws InvalidArgumentException when overlap is negative or not smaller than the chunk size
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000;
        $overlap = $options[self::OPTION_OVERLAP] ?? 200;

        if ($overlap < 0 || $overlap >= $chunkSize) {
            throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
        }

        // Distance between the start offsets of two consecutive chunks; always positive after the check above.
        $step = $chunkSize - $overlap;

        foreach ($documents as $document) {
            $text = $document->content;
            $length = mb_strlen($text);

            if ($length <= $chunkSize) {
                yield $document;

                continue;
            }

            for ($start = 0; $start < $length; $start += $step) {
                // mb_substr() stops at the end of the text, so the last chunk may be shorter than $chunkSize.
                $chunkText = mb_substr($text, $start, $chunkSize);

                // Inherited metadata goes first so that the chunk's own "parent_id" and "text" always win,
                // e.g. when the parent itself is a chunk produced by an earlier split.
                yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
                    ...$document->metadata,
                    'parent_id' => $document->id,
                    'text' => $chunkText,
                ]));

                // This chunk already reached the end of the text; another step would only yield a chunk that
                // is completely contained in the overlap of this one.
                if ($start + $chunkSize >= $length) {
                    break;
                }
            }
        }
    }
}
23 changes: 23 additions & 0 deletions src/Store/Document/TransformerInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document;

/**
* A Transformer is designed to mutate a stream of TextDocuments with the purpose of preparing them for indexing.
* It can reduce or expand the number of documents, modify their content or metadata.
* It should not act blocking, but is expected to iterate over incoming documents and yield prepared ones.
*
* @author Christopher Hertel <[email protected]>
*/
interface TransformerInterface
{
/**
* Transforms the incoming documents and returns the prepared ones.
*
* @param iterable<TextDocument> $documents Documents to transform; may be a lazy iterable such as a generator.
* @param array<string, mixed> $options Transformer-specific options; each implementation defines the keys it reads.
*
* @return iterable<TextDocument> The transformed documents; the number of documents may differ from the input.
*/
public function __invoke(iterable $documents, array $options = []): iterable;
}
53 changes: 53 additions & 0 deletions src/Store/Document/Vectorizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\Document;

use PhpLlm\LlmChain\Platform\Capability;
use PhpLlm\LlmChain\Platform\Model;
use PhpLlm\LlmChain\Platform\PlatformInterface;

/**
 * The Vectorizer encapsulates the logic to convert a collection of TextDocuments into VectorDocuments. It checks for
 * the model's capabilities to handle batch processing or handles it with HttpClient's concurrency feature.
 */
final readonly class Vectorizer
{
    public function __construct(
        private PlatformInterface $platform,
        private Model $model,
    ) {
    }

    /**
     * Creates one VectorDocument per given TextDocument, keeping the document's id and metadata.
     *
     * @param TextDocument[] $documents documents to vectorize; array keys are ignored
     *
     * @return VectorDocument[] list of vector documents in the same order as the input
     */
    public function vectorizeDocuments(array $documents): array
    {
        // Vectors are looked up by zero-based position below, so the input must be a list as well - otherwise
        // arrays with gaps or string keys (e.g. the result of array_filter()) would be paired incorrectly.
        $documents = array_values($documents);

        // Nothing to vectorize, so do not send a request with an empty payload.
        if ([] === $documents) {
            return [];
        }

        if ($this->model->supports(Capability::INPUT_MULTIPLE)) {
            // The model accepts several inputs at once: a single request covers all documents.
            $contents = array_map(static fn (TextDocument $document) => $document->content, $documents);

            $vectors = $this->platform->request($this->model, $contents)->getContent();
        } else {
            // Issue all requests before reading any result; presumably this lets the platform resolve them
            // concurrently, as described for this class - confirm against the platform implementation.
            $responses = [];
            foreach ($documents as $document) {
                $responses[] = $this->platform->request($this->model, $document->content);
            }

            // Flatten the per-request results into one list with a single merge instead of re-copying the
            // accumulated array on every iteration.
            $vectors = array_merge(...array_map(static fn ($response) => $response->getContent(), $responses));
        }

        $vectorDocuments = [];
        foreach ($documents as $i => $document) {
            $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
        }

        return $vectorDocuments;
    }
}
Loading