Skip to content
This repository was archived by the owner on Jul 16, 2025. It is now read-only.

Commit c76251f

Browse files
committed
feat: implement document loader & transformer pipeline for store indexing
1 parent 9182c34 commit c76251f

14 files changed

+625
-88
lines changed

examples/store/document-splitting.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php

declare(strict_types=1);

use PhpLlm\LlmChain\Store\Document\Loader\TextFileLoader;
use PhpLlm\LlmChain\Store\Document\Transformer\TextSplitTransformer;

require_once dirname(__DIR__, 2).'/vendor/autoload.php';

// Text fixture used as input for this example.
$fixturePath = dirname(__DIR__, 2).'/tests/Fixture/lorem.txt';

$loadDocuments = new TextFileLoader();
$splitDocuments = new TextSplitTransformer();

// Loader and splitter are generators, so nothing is read or split until the
// pipeline is materialized by iterator_to_array().
$loaded = $loadDocuments($fixturePath);
$documents = iterator_to_array($splitDocuments($loaded));

dump($documents);
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Loader;
6+
7+
use PhpLlm\LlmChain\Store\Document\LoaderInterface;
8+
use PhpLlm\LlmChain\Store\Document\Metadata;
9+
use PhpLlm\LlmChain\Store\Document\TextDocument;
10+
use PhpLlm\LlmChain\Store\Exception\RuntimeException;
11+
use Symfony\Component\Uid\Uuid;
12+
13+
/**
14+
* @author Christopher Hertel <[email protected]>
15+
*/
16+
final readonly class TextFileLoader implements LoaderInterface
{
    /**
     * Reads the file at $source and yields its trimmed content as a single TextDocument.
     *
     * This method is a generator: the file is only checked and read once the returned
     * iterable is iterated, so the exceptions below surface at that point.
     *
     * @param string               $source  path of the file to read
     * @param array<string, mixed> $options accepted for interface compatibility; not read by this loader
     *
     * @return iterable<TextDocument> one document carrying the path under the "source" metadata key
     *
     * @throws RuntimeException when $source is not a file or its content cannot be read
     */
    public function __invoke(string $source, array $options = []): iterable
    {
        $raw = is_file($source)
            ? file_get_contents($source)
            : throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));

        if (false === $raw) {
            throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
        }

        $document = new TextDocument(Uuid::v4(), trim($raw), new Metadata(['source' => $source]));

        yield $document;
    }
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document;
6+
7+
/**
8+
* @author Christopher Hertel <[email protected]>
9+
*/
10+
interface LoaderInterface
11+
{
12+
/**
* Loads TextDocuments from the given source.
*
13+
* @param string $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL.
14+
* @param array<string, mixed> $options loader specific set of options to control the loading process
15+
*
16+
* @return iterable<TextDocument> iterable of TextDocuments loaded from the source
17+
*/
18+
public function __invoke(string $source, array $options = []): iterable;
19+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Transformer;
6+
7+
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
8+
9+
final readonly class ChainTransformer implements TransformerInterface
{
    /**
     * @var list<TransformerInterface> transformers in the order they are applied
     */
    private array $transformers;

    /**
     * @param iterable<TransformerInterface> $transformers transformers to apply, in iteration order
     */
    public function __construct(iterable $transformers)
    {
        // Keys carry no meaning for the chain, so they are discarded. This matters for
        // Traversables that emit the same key more than once (e.g. generators using
        // "yield from"): with preserved keys, later transformers would silently
        // overwrite earlier ones in the resulting array.
        $this->transformers = $transformers instanceof \Traversable
            ? iterator_to_array($transformers, false)
            : array_values($transformers);
    }

    /**
     * Passes the documents through every transformer in order; the output of one
     * transformer is the input of the next. The same $options are handed to each
     * transformer.
     *
     * @param iterable<TextDocument> $documents
     * @param array<string, mixed>   $options
     *
     * @return iterable<TextDocument> result of the last transformer, or $documents itself when the chain is empty
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        foreach ($this->transformers as $transformer) {
            $documents = $transformer($documents, $options);
        }

        return $documents;
    }
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Transformer;
6+
7+
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
8+
use Symfony\Component\Clock\ClockInterface;
9+
10+
/**
11+
* This transformer splits the stream of documents into chunks and pauses for a configurable number of seconds in between, which is useful
12+
* when indexing a lot of documents and facing API rate limits.
13+
*
14+
* @author Christopher Hertel <[email protected]>
15+
*/
16+
final readonly class ChunkDelayTransformer implements TransformerInterface
{
    public const OPTION_CHUNK_SIZE = 'chunk_size';
    public const OPTION_DELAY = 'delay';

    public function __construct(
        private ClockInterface $clock,
    ) {
    }

    /**
     * Yields every incoming document unchanged and sleeps each time a full chunk has been passed on.
     *
     * Because this is a generator, the sleep runs when the consumer asks for the
     * document that follows a completed chunk.
     *
     * @param array{chunk_size?: int, delay?: int} $options chunk_size: documents per chunk (default 50);
     *                                                      delay: value passed to the clock's sleep() after each
     *                                                      chunk (default 10), 0 disables sleeping
     *
     * @return iterable<TextDocument> the same documents, in the same order
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 50;
        $delay = $options[self::OPTION_DELAY] ?? 10;

        $counter = 0;
        foreach ($documents as $document) {
            yield $document;
            ++$counter;

            if ($chunkSize !== $counter) {
                continue;
            }

            // Start counting the next chunk. Without this reset the delay would
            // only ever be applied once, after the very first chunk.
            $counter = 0;

            if (0 !== $delay) {
                $this->clock->sleep($delay);
            }
        }
    }
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document\Transformer;
6+
7+
use PhpLlm\LlmChain\Store\Document\Metadata;
8+
use PhpLlm\LlmChain\Store\Document\TextDocument;
9+
use PhpLlm\LlmChain\Store\Document\TransformerInterface;
10+
use PhpLlm\LlmChain\Store\Exception\InvalidArgumentException;
11+
use Symfony\Component\Uid\Uuid;
12+
13+
/**
14+
* Splits a TextDocument into smaller chunks of specified size with optional overlap.
15+
* If the document's content is not longer than the specified chunk size, the original document is yielded unchanged as a single chunk.
16+
* Overlap cannot be negative and must be less than the chunk size.
17+
*
18+
* @author Christopher Hertel <[email protected]>
19+
*/
20+
final readonly class TextSplitTransformer implements TransformerInterface
{
    public const OPTION_CHUNK_SIZE = 'chunk_size';
    public const OPTION_OVERLAP = 'overlap';

    /**
     * Splits every document whose content is longer than the chunk size into overlapping chunks.
     *
     * Documents that fit into a single chunk are yielded as they are. Every chunk becomes a new
     * TextDocument with a fresh UUID; its metadata holds "parent_id" and "text" plus the metadata
     * of the source document. Lengths are measured in characters (mb_*), not bytes.
     *
     * This method is a generator, so the option validation runs on first iteration.
     *
     * @param iterable<TextDocument>                 $documents
     * @param array{chunk_size?: int, overlap?: int} $options   chunk_size: maximum characters per chunk (default 1000);
     *                                                          overlap: characters shared by consecutive chunks (default 200)
     *
     * @return iterable<TextDocument>
     *
     * @throws InvalidArgumentException when overlap is negative or not smaller than the chunk size
     */
    public function __invoke(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000;
        $overlap = $options[self::OPTION_OVERLAP] ?? 200;

        if ($overlap < 0 || $overlap >= $chunkSize) {
            throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
        }

        // Distance between the start offsets of two consecutive chunks; always >= 1 after the check above.
        $step = $chunkSize - $overlap;

        foreach ($documents as $document) {
            $text = $document->content;
            $length = mb_strlen($text);

            if ($length <= $chunkSize) {
                yield $document;

                continue;
            }

            for ($start = 0; $start < $length; $start += $step) {
                // mb_substr() stops at the end of the string, so the last chunk may be shorter.
                $chunkText = mb_substr($text, $start, $chunkSize);

                // NOTE(review): the source metadata is spread last, so existing "parent_id"/"text"
                // keys of the source document take precedence over the values set here — confirm
                // this is intended when splitters are chained.
                yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
                    'parent_id' => $document->id,
                    'text' => $chunkText,
                    ...$document->metadata,
                ]));

                // Once a chunk reaches the end of the text, any further chunk would lie
                // entirely inside it and only duplicate content, so stop here.
                if ($start + $chunkSize >= $length) {
                    break;
                }
            }
        }
    }
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document;
6+
7+
/**
8+
* A Transformer is designed to mutate a stream of TextDocuments with the purpose of preparing them for indexing.
9+
* It can reduce or expand the number of documents, modify their content or metadata.
10+
* It should not block, but is expected to iterate over the incoming documents and yield the prepared ones.
11+
*
12+
* @author Christopher Hertel <[email protected]>
13+
*/
14+
interface TransformerInterface
15+
{
16+
/**
* Transforms the given documents and returns the resulting documents.
*
17+
* @param iterable<TextDocument> $documents documents to transform
18+
* @param array<string, mixed> $options options interpreted by the transformer implementation
19+
*
20+
* @return iterable<TextDocument> the transformed documents
21+
*/
22+
public function __invoke(iterable $documents, array $options = []): iterable;
23+
}

src/Store/Document/Vectorizer.php

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Store\Document;
6+
7+
use PhpLlm\LlmChain\Platform\Capability;
8+
use PhpLlm\LlmChain\Platform\Model;
9+
use PhpLlm\LlmChain\Platform\PlatformInterface;
10+
11+
/**
12+
* The Vectorizer encapsulates the logic to convert a collection of TextDocuments into VectorDocuments. It checks for
13+
* the model's capabilities to handle batch processing or handles it with HttpClient's concurrency feature.
14+
*/
15+
final readonly class Vectorizer
{
    public function __construct(
        private PlatformInterface $platform,
        private Model $model,
    ) {
    }

    /**
     * Requests a vector for the content of every document and pairs it with the document's id and metadata.
     *
     * When the model supports Capability::INPUT_MULTIPLE, all contents are sent in one request;
     * otherwise one request per document is issued. In the latter case all requests are created
     * before the first response content is read.
     *
     * @param TextDocument[] $documents documents to vectorize; array keys are ignored
     *
     * @return VectorDocument[] one VectorDocument per input document, in input order
     */
    public function vectorizeDocuments(array $documents): array
    {
        // Nothing to do, and no reason to send an empty request to the platform.
        if ([] === $documents) {
            return [];
        }

        // Vectors are matched to documents by position below, so normalize the keys to
        // 0..n-1 first; arbitrary caller keys would otherwise misalign the lookup.
        $documents = array_values($documents);

        if ($this->model->supports(Capability::INPUT_MULTIPLE)) {
            $contents = array_map(static fn (TextDocument $document) => $document->content, $documents);

            // NOTE(review): assumes the response content is a list of vectors in input order — confirm.
            $vectors = $this->platform->request($this->model, $contents)->getContent();
        } else {
            // Issue every request before reading any content, as the original two-phase loop did.
            $responses = [];
            foreach ($documents as $document) {
                $responses[] = $this->platform->request($this->model, $document->content);
            }

            $vectors = array_merge(...array_map(static fn ($response) => $response->getContent(), $responses));
        }

        $vectorDocuments = [];
        foreach ($documents as $i => $document) {
            $vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
        }

        return $vectorDocuments;
    }
}

src/Store/Indexer.php

Lines changed: 15 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,87 +4,48 @@
44

55
namespace PhpLlm\LlmChain\Store;
66

7-
use PhpLlm\LlmChain\Platform\Capability;
8-
use PhpLlm\LlmChain\Platform\Model;
9-
use PhpLlm\LlmChain\Platform\PlatformInterface;
107
use PhpLlm\LlmChain\Store\Document\TextDocument;
11-
use PhpLlm\LlmChain\Store\Document\VectorDocument;
8+
use PhpLlm\LlmChain\Store\Document\Vectorizer;
129
use Psr\Log\LoggerInterface;
1310
use Psr\Log\NullLogger;
14-
use Symfony\Component\Clock\Clock;
15-
use Symfony\Component\Clock\ClockInterface;
1611

1712
/**
13+
* Converts a collection of TextDocuments into VectorDocuments and pushes them to a store implementation.
14+
*
1815
* @author Christopher Hertel <[email protected]>
1916
*/
2017
final readonly class Indexer
{
    public function __construct(
        private Vectorizer $vectorizer,
        private StoreInterface $store,
        private LoggerInterface $logger = new NullLogger(),
    ) {
    }

    /**
     * Vectorizes the given documents in batches and adds every batch to the store.
     *
     * Documents are consumed one by one, so any iterable (including generators) works. A batch
     * is flushed as soon as it holds $chunkSize documents; whatever remains after the last
     * document is flushed as a final, smaller batch. With a $chunkSize below 1 the size check
     * never matches, so all documents end up in that single final batch.
     *
     * @param TextDocument|iterable<TextDocument> $documents
     * @param int                                 $chunkSize number of documents to vectorize and store in one batch
     */
    public function index(TextDocument|iterable $documents, int $chunkSize = 50): void
    {
        if ($documents instanceof TextDocument) {
            $documents = [$documents];
        }

        $counter = 0;
        $chunk = [];
        foreach ($documents as $document) {
            $chunk[] = $document;
            ++$counter;

            if ($chunkSize === \count($chunk)) {
                $this->storeChunk($chunk);
                $chunk = [];
            }
        }

        // Flush the trailing partial batch; without this, documents beyond the last
        // full batch would be counted in the log message but never stored.
        if ([] !== $chunk) {
            $this->storeChunk($chunk);
        }

        $this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter));
    }

    /**
     * Vectorizes one batch of documents and adds the result to the store.
     *
     * @param TextDocument[] $chunk
     */
    private function storeChunk(array $chunk): void
    {
        $this->store->add(...$this->vectorizer->vectorizeDocuments($chunk));
    }
}

0 commit comments

Comments
 (0)