Skip to content

Commit d1c2e81

Browse files
committed
feat: implement document loader & transformer for store indexing (#343)
1 parent c6e32b6 commit d1c2e81

20 files changed

+825
-110
lines changed

examples/store/document-splitting.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Store\Document\Loader\TextFileLoader;
13+
use Symfony\AI\Store\Document\Transformer\TextSplitTransformer;
14+
15+
require_once dirname(__DIR__, 2).'/vendor/autoload.php';
16+
17+
$loader = new TextFileLoader();
18+
$splitter = new TextSplitTransformer();
19+
// The sample text added by this change lives in the repository-level "fixtures" directory.
$source = dirname(__DIR__, 2).'/fixtures/lorem.txt';
20+
21+
$documents = iterator_to_array($splitter($loader($source)));
22+
23+
dump($documents);
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Symfony\AI\Platform\Bridge\OpenAI\Embeddings;
13+
use Symfony\AI\Platform\Bridge\OpenAI\PlatformFactory;
14+
use Symfony\AI\Store\Document\TextDocument;
15+
use Symfony\AI\Store\Document\VectorDocument;
16+
use Symfony\AI\Store\Document\Vectorizer;
17+
use Symfony\Component\Dotenv\Dotenv;
18+
use Symfony\Component\Uid\Uuid;
19+
20+
require_once dirname(__DIR__, 2).'/vendor/autoload.php';
21+
(new Dotenv())->loadEnv(dirname(__DIR__, 2).'/.env');
22+
23+
if (empty($_ENV['OPENAI_API_KEY'])) {
24+
echo 'Please set the OPENAI_API_KEY environment variable.'.\PHP_EOL;
25+
exit(1);
26+
}
27+
28+
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
29+
$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE);
30+
31+
$textDocuments = [
32+
new TextDocument(Uuid::v4(), 'Hello World'),
33+
new TextDocument(Uuid::v4(), 'Lorem ipsum dolor sit amet'),
34+
new TextDocument(Uuid::v4(), 'PHP Hypertext Preprocessor'),
35+
];
36+
37+
$vectorizer = new Vectorizer($platform, $embeddings);
38+
$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments);
39+
40+
dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments));
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
use Doctrine\DBAL\DriverManager;
13+
use Doctrine\DBAL\Tools\DsnParser;
14+
use Symfony\AI\Agent\Agent;
15+
use Symfony\AI\Agent\Toolbox\AgentProcessor;
16+
use Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch;
17+
use Symfony\AI\Agent\Toolbox\Toolbox;
18+
use Symfony\AI\Platform\Bridge\Google\Embeddings;
19+
use Symfony\AI\Platform\Bridge\Google\Embeddings\TaskType;
20+
use Symfony\AI\Platform\Bridge\Google\Gemini;
21+
use Symfony\AI\Platform\Bridge\Google\PlatformFactory;
22+
use Symfony\AI\Platform\Message\Message;
23+
use Symfony\AI\Platform\Message\MessageBag;
24+
use Symfony\AI\Store\Bridge\MariaDB\Store;
25+
use Symfony\AI\Store\Document\Metadata;
26+
use Symfony\AI\Store\Document\TextDocument;
27+
use Symfony\AI\Store\Document\Vectorizer;
28+
use Symfony\AI\Store\Indexer;
29+
use Symfony\Component\Dotenv\Dotenv;
30+
use Symfony\Component\Uid\Uuid;
31+
32+
require_once dirname(__DIR__, 2).'/vendor/autoload.php';
33+
(new Dotenv())->loadEnv(dirname(__DIR__, 2).'/.env');
34+
35+
if (empty($_ENV['GOOGLE_API_KEY']) || empty($_ENV['MARIADB_URI'])) {
36+
echo 'Please set GOOGLE_API_KEY and MARIADB_URI environment variables.'.\PHP_EOL;
37+
exit(1);
38+
}
39+
40+
// initialize the store
41+
$store = Store::fromDbal(
42+
connection: DriverManager::getConnection((new DsnParser())->parse($_ENV['MARIADB_URI'])),
43+
tableName: 'my_table',
44+
indexName: 'my_index',
45+
vectorFieldName: 'embedding',
46+
);
47+
48+
// our data
49+
$movies = [
50+
['title' => 'Inception', 'description' => 'A skilled thief is given a chance at redemption if he can successfully perform inception, the act of planting an idea in someone\'s subconscious.', 'director' => 'Christopher Nolan'],
51+
['title' => 'The Matrix', 'description' => 'A hacker discovers the world he lives in is a simulated reality and joins a rebellion to overthrow its controllers.', 'director' => 'The Wachowskis'],
52+
['title' => 'The Godfather', 'description' => 'The aging patriarch of an organized crime dynasty transfers control of his empire to his reluctant son.', 'director' => 'Francis Ford Coppola'],
53+
];
54+
55+
// create embeddings and documents
56+
foreach ($movies as $i => $movie) {
57+
$documents[] = new TextDocument(
58+
id: Uuid::v4(),
59+
content: 'Title: '.$movie['title'].\PHP_EOL.'Director: '.$movie['director'].\PHP_EOL.'Description: '.$movie['description'],
60+
metadata: new Metadata($movie),
61+
);
62+
}
63+
64+
// initialize the table
65+
$store->initialize(['dimensions' => 768]);
66+
67+
// create embeddings for documents
68+
$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']);
69+
$embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]);
70+
$vectorizer = new Vectorizer($platform, $embeddings);
71+
$indexer = new Indexer($vectorizer, $store);
72+
$indexer->index($documents);
73+
74+
$model = new Gemini(Gemini::GEMINI_2_FLASH_LITE);
75+
76+
$similaritySearch = new SimilaritySearch($platform, $embeddings, $store);
77+
$toolbox = Toolbox::create($similaritySearch);
78+
$processor = new AgentProcessor($toolbox);
79+
$agent = new Agent($platform, $model, [$processor], [$processor]);
80+
81+
$messages = new MessageBag(
82+
Message::forSystem('Please answer all user questions only using SimilaritySearch function.'),
83+
Message::ofUser('Which movie fits the theme of the mafia?')
84+
);
85+
$response = $agent->call($messages);
86+
87+
echo $response->getContent().\PHP_EOL;

examples/store/mariadb-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
use Symfony\AI\Store\Bridge\MariaDB\Store;
2222
use Symfony\AI\Store\Document\Metadata;
2323
use Symfony\AI\Store\Document\TextDocument;
24+
use Symfony\AI\Store\Document\Vectorizer;
2425
use Symfony\AI\Store\Indexer;
2526
use Symfony\Component\Dotenv\Dotenv;
2627
use Symfony\Component\Uid\Uuid;
@@ -62,7 +63,8 @@
6263

6364
// create embeddings for documents
6465
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
65-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
66+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
67+
$indexer = new Indexer($vectorizer, $store);
6668
$indexer->index($documents);
6769

6870
$model = new GPT(GPT::GPT_4O_MINI);

examples/store/mongodb-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
use Symfony\AI\Store\Bridge\MongoDB\Store;
2323
use Symfony\AI\Store\Document\Metadata;
2424
use Symfony\AI\Store\Document\TextDocument;
25+
use Symfony\AI\Store\Document\Vectorizer;
2526
use Symfony\AI\Store\Indexer;
2627
use Symfony\Component\Dotenv\Dotenv;
2728
use Symfony\Component\Uid\Uuid;
@@ -61,7 +62,8 @@
6162

6263
// create embeddings for documents
6364
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
64-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
65+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
66+
$indexer = new Indexer($vectorizer, $store);
6567
$indexer->index($documents);
6668

6769
// initialize the index

examples/store/pinecone-similarity-search.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* file that was distributed with this source code.
1010
*/
1111

12+
use Symfony\AI\Store\Document\Vectorizer;
1213
use Probots\Pinecone\Pinecone;
1314
use Symfony\AI\Agent\Agent;
1415
use Symfony\AI\Agent\Toolbox\AgentProcessor;
@@ -55,7 +56,8 @@
5556

5657
// create embeddings for documents
5758
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
58-
$indexer = new Indexer($platform, $embeddings = new Embeddings(), $store);
59+
$vectorizer = new Vectorizer($platform, $embeddings = new Embeddings());
60+
$indexer = new Indexer($vectorizer, $store);
5961
$indexer->index($documents);
6062

6163
$model = new GPT(GPT::GPT_4O_MINI);

fixtures/lorem.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa.
2+
Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis,
3+
ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo,
4+
fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae,
5+
justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper
6+
nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim.
7+
Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius
8+
laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies
9+
nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero,
10+
sit amet adipiscing sem neque sed ipsum. Nam quam nunc, blandit vel, luctus pulvinar, hendrerit id, lorem.
11+
Maecenas nec odio et ante tincidunt tempus. Donec vitae sapien ut libero venenatis faucibus. Nullam quis
12+
ante. Etiam sit amet orci eget eros faucibus tincidunt. Duis leo. Sed fringilla mauris sit amet nibh. Donec
13+
sodales sagittis magna. Sed consequat, leo eget bibendum sodales, augue velit cursus nunc, quis gravida
14+
magna mi a libero. Fusce vulputate eleifend sapien. Vestibulum purus quam, scelerisque ut, mollis sed,
15+
nonummy id, met

src/mcp-sdk/tests/Fixtures/InMemoryTransport.php

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
<?php
22

3-
4-
53
/*
64
* This file is part of the Symfony package.
75
*
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\AI\Store\Document\Loader;
13+
14+
use Symfony\AI\Store\Document\LoaderInterface;
15+
use Symfony\AI\Store\Document\Metadata;
16+
use Symfony\AI\Store\Document\TextDocument;
17+
use Symfony\AI\Store\Exception\RuntimeException;
18+
use Symfony\Component\Uid\Uuid;
19+
20+
/**
21+
* @author Christopher Hertel <[email protected]>
22+
*/
23+
final readonly class TextFileLoader implements LoaderInterface
24+
{
25+
public function __invoke(string $source, array $options = []): iterable
26+
{
27+
if (!is_file($source)) {
28+
throw new RuntimeException(\sprintf('File "%s" does not exist.', $source));
29+
}
30+
31+
$content = file_get_contents($source);
32+
33+
if (false === $content) {
34+
throw new RuntimeException(\sprintf('Unable to read file "%s"', $source));
35+
}
36+
37+
yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
38+
'source' => $source,
39+
]));
40+
}
41+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\AI\Store\Document;
13+
14+
/**
15+
* @author Christopher Hertel <[email protected]>
16+
*/
17+
interface LoaderInterface
18+
{
19+
/**
20+
* @param string $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL.
21+
* @param array<string, mixed> $options loader specific set of options to control the loading process
22+
*
23+
* @return iterable<TextDocument> iterable of TextDocuments loaded from the source
24+
*/
25+
public function __invoke(string $source, array $options = []): iterable;
26+
}

0 commit comments

Comments
 (0)