
Commit 87a92ba

fix(web): batch findMany queries fixes #360

1 parent fb2ef05 commit 87a92ba

11 files changed: +182 -36 lines changed

docs/docs/configuration/environment-variables.mdx

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ The following environment variables allow you to configure your Sourcebot deploy
 | `DATA_DIR` | `/data` | <p>The directory within the container to store all persistent data. Typically, this directory will be volume mapped such that data is persisted across container restarts (e.g., `docker run -v $(pwd):/data`)</p> |
 | `DATABASE_DATA_DIR` | `$DATA_CACHE_DIR/db` | <p>The data directory for the default Postgres database.</p> |
 | `DATABASE_URL` | `postgresql://postgres@localhost:5432/sourcebot` | <p>Connection string of your Postgres database. By default, a Postgres database is automatically provisioned at startup within the container.</p><p>If you'd like to use a non-default schema, you can provide it as a parameter in the database url.</p> |
+| `DB_QUERY_BATCH_SIZE` | `500` | <p>The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue [#13864](https://github.com/prisma/prisma/issues/13864). Can also be configured via the `dbQueryBatchSize` setting in the configuration file. Valid range: 100-10000.</p> |
 | `EMAIL_FROM_ADDRESS` | `-` | <p>The email address that transactional emails will be sent from. See [this doc](/docs/configuration/transactional-emails) for more info.</p> |
 | `REDIS_DATA_DIR` | `$DATA_CACHE_DIR/redis` | <p>The data directory for the default Redis instance.</p> |
 | `REDIS_URL` | `redis://localhost:6379` | <p>Connection string of your Redis instance. By default, a Redis database is automatically provisioned at startup within the container.</p> |
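
To illustrate the documented 100-10000 range, here is a minimal sketch of how a caller might validate a raw `DB_QUERY_BATCH_SIZE` value before use; the `resolveBatchSize` helper is hypothetical and not part of this commit, which only documents the range.

// Hypothetical helper (not in this commit): clamp a raw env value to the
// documented 100-10000 range, falling back to the 500 default when unset.
function resolveBatchSize(raw: string | undefined): number {
    const parsed = raw === undefined ? NaN : Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return 500;
    }
    return Math.min(10000, Math.max(100, parsed));
}

// resolveBatchSize(undefined) === 500
// resolveBatchSize("50") === 100
// resolveBatchSize("2000") === 2000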

docs/snippets/schemas/v3/index.schema.mdx

Lines changed: 14 additions & 0 deletions
@@ -68,6 +68,13 @@
       "type": "boolean",
       "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.",
       "default": false
+    },
+    "dbQueryBatchSize": {
+      "type": "number",
+      "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.",
+      "minimum": 100,
+      "maximum": 10000,
+      "default": 500
     }
   },
   "additionalProperties": false
@@ -182,6 +189,13 @@
       "type": "boolean",
       "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.",
       "default": false
+    },
+    "dbQueryBatchSize": {
+      "type": "number",
+      "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.",
+      "minimum": 100,
+      "maximum": 10000,
+      "default": 500
     }
   },
   "additionalProperties": false

packages/backend/src/constants.ts

Lines changed: 1 addition & 0 deletions
@@ -16,4 +16,5 @@ export const DEFAULT_SETTINGS: Settings = {
     repoGarbageCollectionGracePeriodMs: 10 * 1000, // 10 seconds
     repoIndexTimeoutMs: 1000 * 60 * 60 * 2, // 2 hours
     enablePublicAccess: false,
+    dbQueryBatchSize: 500, // Default batch size for database queries
 }
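
The default above presumably gets layered under any `dbQueryBatchSize` value supplied via the configuration file. A minimal sketch of that kind of merge, assuming a simple spread over `DEFAULT_SETTINGS` (the actual settings-loading code and the import paths shown here are not part of this diff):

// Assumption: config-file settings arrive as a partial object merged over the defaults.
import { DEFAULT_SETTINGS } from "./constants";
import type { Settings } from "@sourcebot/schemas/v3/index.type"; // illustrative path

const resolveSettings = (fromConfig: Partial<Settings>): Settings => ({
    ...DEFAULT_SETTINGS,
    ...fromConfig,
});

// resolveSettings({}).dbQueryBatchSize === 500
// resolveSettings({ dbQueryBatchSize: 1000 }).dbQueryBatchSize === 1000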

packages/schemas/src/v3/index.schema.ts

Lines changed: 14 additions & 0 deletions
@@ -67,6 +67,13 @@ const schema = {
       "type": "boolean",
       "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.",
       "default": false
+    },
+    "dbQueryBatchSize": {
+      "type": "number",
+      "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.",
+      "minimum": 100,
+      "maximum": 10000,
+      "default": 500
     }
   },
   "additionalProperties": false
@@ -181,6 +188,13 @@ const schema = {
       "type": "boolean",
       "description": "[Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.",
       "default": false
+    },
+    "dbQueryBatchSize": {
+      "type": "number",
+      "description": "The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.",
+      "minimum": 100,
+      "maximum": 10000,
+      "default": 500
     }
   },
   "additionalProperties": false

packages/schemas/src/v3/index.type.ts

Lines changed: 4 additions & 0 deletions
@@ -83,6 +83,10 @@ export interface Settings {
    * [Sourcebot EE] When enabled, allows unauthenticated users to access Sourcebot. Requires an enterprise license with an unlimited number of seats.
    */
   enablePublicAccess?: boolean;
+  /**
+   * The batch size for database queries to prevent memory issues with large datasets. This is a workaround for Prisma issue #13864. Defaults to 500.
+   */
+  dbQueryBatchSize?: number;
 }
 /**
  * Search context

packages/web/src/actions.ts

Lines changed: 37 additions & 19 deletions
@@ -662,28 +662,46 @@ export const getConnectionInfo = async (connectionId: number, domain: string) =>
 export const getRepos = async (domain: string, filter: { status?: RepoIndexingStatus[], connectionId?: number } = {}) => sew(() =>
     withAuth((userId) =>
         withOrgMembership(userId, domain, async ({ org }) => {
-            const repos = await prisma.repo.findMany({
-                where: {
-                    orgId: org.id,
-                    ...(filter.status ? {
-                        repoIndexingStatus: { in: filter.status }
-                    } : {}),
-                    ...(filter.connectionId ? {
-                        connections: {
-                            some: {
-                                connectionId: filter.connectionId
-                            }
-                        }
-                    } : {}),
-                },
-                include: {
+            // Use batched query to prevent memory issues with large datasets.
+            // The batch size is configurable via DB_QUERY_BATCH_SIZE environment variable.
+            const whereClause = {
+                orgId: org.id,
+                ...(filter.status ? {
+                    repoIndexingStatus: { in: filter.status }
+                } : {}),
+                ...(filter.connectionId ? {
                     connections: {
-                        include: {
-                            connection: true,
+                        some: {
+                            connectionId: filter.connectionId
                         }
                     }
-                }
-            });
+                } : {}),
+            };
+
+            // First get the total count
+            const totalCount = await prisma.repo.count({ where: whereClause });
+
+            const repos = [];
+            const batchSize = env.DB_QUERY_BATCH_SIZE;
+            const totalBatches = Math.ceil(totalCount / batchSize);
+
+            // Execute queries in batches
+            for (let i = 0; i < totalBatches; i++) {
+                const skip = i * batchSize;
+                const batchResults = await prisma.repo.findMany({
+                    where: whereClause,
+                    include: {
+                        connections: {
+                            include: {
+                                connection: true,
+                            }
+                        }
+                    },
+                    skip,
+                    take: batchSize,
+                });
+                repos.push(...batchResults);
+            }
 
             return repos.map((repo) => repositoryQuerySchema.parse({
                 codeHostType: repo.external_codeHostType,
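
The batching above uses a count() followed by skip/take offset pagination. As a sketch of an alternative (not what this commit does), Prisma also supports cursor pagination, which avoids the separate count and stays stable if rows are inserted or deleted between batches; the function name below is hypothetical and it assumes Repo has a numeric `id` primary key.

import { prisma } from "@/prisma";
import { env } from "@/env.mjs";

// Alternative sketch: cursor-based batching over the Repo `id` key.
export async function getReposByCursor(orgId: number) {
    const batchSize = env.DB_QUERY_BATCH_SIZE;
    const repos = [];
    let cursor: number | undefined = undefined;

    while (true) {
        const batch = await prisma.repo.findMany({
            where: { orgId },
            orderBy: { id: "asc" },
            take: batchSize,
            // Resume just after the last row of the previous batch.
            ...(cursor !== undefined ? { cursor: { id: cursor }, skip: 1 } : {}),
        });
        repos.push(...batch);
        if (batch.length < batchSize) {
            break;
        }
        cursor = batch[batch.length - 1].id;
    }

    return repos;
}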

packages/web/src/auth.ts

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ export const { handlers, signIn, signOut, auth } = NextAuth({
     trustHost: true,
     events: {
         createUser: onCreateUser,
-        signIn: async ({ user, account }) => {
+        signIn: async ({ user, account: _account }) => {
             if (user.id) {
                 await auditService.createAudit({
                     action: "user.signed_in",

packages/web/src/env.mjs

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,9 @@ export const env = createEnv({
         TOTAL_MAX_MATCH_COUNT: numberSchema.default(100000),
         ZOEKT_MAX_WALL_TIME_MS: numberSchema.default(10000),
 
+        // Database Query Performance
+        DB_QUERY_BATCH_SIZE: numberSchema.default(500),
+
         // Auth
         AUTH_SECRET: z.string(),
         AUTH_URL: z.string().url(),
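
The `numberSchema` referenced above is not defined in this diff. A common zod pattern for this kind of env schema (an assumption, not necessarily the project's actual definition) coerces the string environment value into a number:

import { z } from "zod";

// Assumption: `numberSchema` coerces string env values to numbers.
const numberSchema = z.coerce.number();

// Mirrors the added line: a missing value falls back to 500, "1000" parses to 1000.
const batchSizeSchema = numberSchema.default(500);
console.log(batchSizeSchema.parse(undefined)); // 500
console.log(batchSizeSchema.parse("1000"));    // 1000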

packages/web/src/features/search/searchApi.ts

Lines changed: 13 additions & 16 deletions
@@ -12,6 +12,7 @@ import * as Sentry from "@sentry/nextjs";
 import { sew, withAuth, withOrgMembership } from "@/actions";
 import { base64Decode } from "@sourcebot/shared";
 import { getAuditService } from "@/ee/features/audit/factory";
+import { batchedFindReposByIds, batchedFindReposByNames } from "@/lib/repoBatchQueries";
 
 const auditService = getAuditService();
 
@@ -198,23 +199,19 @@ export const search = async ({ query, matches, contextLines, whole }: SearchRequ
     const repoIdentifiers = new Set(Result.Files?.map((file) => file.RepositoryID ?? file.Repository) ?? []);
     const repos = new Map<string | number, Repo>();
 
-    (await prisma.repo.findMany({
-        where: {
-            id: {
-                in: Array.from(repoIdentifiers).filter((id) => typeof id === "number"),
-            },
-            orgId: org.id,
-        }
-    })).forEach(repo => repos.set(repo.id, repo));
+    // Batch query repos by ID to prevent memory issues with large datasets
+    const numericIds = Array.from(repoIdentifiers).filter((id) => typeof id === "number") as number[];
+    if (numericIds.length > 0) {
+        const reposByIds = await batchedFindReposByIds(numericIds, org.id);
+        reposByIds.forEach((repo) => repos.set(repo.id, repo));
+    }
 
-    (await prisma.repo.findMany({
-        where: {
-            name: {
-                in: Array.from(repoIdentifiers).filter((id) => typeof id === "string"),
-            },
-            orgId: org.id,
-        }
-    })).forEach(repo => repos.set(repo.name, repo));
+    // Batch query repos by name to prevent memory issues with large datasets
+    const stringNames = Array.from(repoIdentifiers).filter((id) => typeof id === "string") as string[];
+    if (stringNames.length > 0) {
+        const reposByNames = await batchedFindReposByNames(stringNames, org.id);
+        reposByNames.forEach((repo) => repos.set(repo.name, repo));
+    }
 
     const files = Result.Files?.map((file) => {
         const fileNameChunks = file.ChunkMatches.filter((chunk) => chunk.FileName);
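
The ID lookup and the name lookup above are independent of each other, so a possible refinement (a sketch only, not part of this commit) is to run them concurrently with Promise.all while keeping each lookup batched internally:

// Possible refinement: run the two batched lookups concurrently.
const [reposByIds, reposByNames] = await Promise.all([
    numericIds.length > 0 ? batchedFindReposByIds(numericIds, org.id) : Promise.resolve([]),
    stringNames.length > 0 ? batchedFindReposByNames(stringNames, org.id) : Promise.resolve([]),
]);
reposByIds.forEach((repo) => repos.set(repo.id, repo));
reposByNames.forEach((repo) => repos.set(repo.name, repo));
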
packages/web/src/lib/repoBatchQueries.ts

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+/**
+ * Utility functions for batched Repo queries to handle large datasets efficiently
+ * and prevent memory issues like "Failed to convert rust String into napi string"
+ *
+ * This is a workaround for the Prisma issue: https://github.com/prisma/prisma/issues/13864
+ *
+ * The batch size can be configured via the DB_QUERY_BATCH_SIZE environment variable
+ * or the dbQueryBatchSize setting in the configuration file.
+ */
+
+import { Repo } from "@sourcebot/db";
+import { prisma } from "@/prisma";
+import { env } from "@/env.mjs";
+
+const DEFAULT_BATCH_SIZE = env.DB_QUERY_BATCH_SIZE;
+
+/**
+ * Fetches repos by IDs in batches to prevent memory issues
+ * @param ids - Array of repo IDs to fetch
+ * @param orgId - Organization ID to filter by
+ * @param batchSize - Size of each batch (default: 500)
+ * @returns Array of repos
+ */
+export async function batchedFindReposByIds(
+    ids: number[],
+    orgId: number,
+    batchSize: number = DEFAULT_BATCH_SIZE
+): Promise<Repo[]> {
+    if (ids.length === 0) {
+        return [];
+    }
+
+    const results: Repo[] = [];
+    const totalBatches = Math.ceil(ids.length / batchSize);
+
+    for (let i = 0; i < totalBatches; i++) {
+        const startIndex = i * batchSize;
+        const endIndex = Math.min(startIndex + batchSize, ids.length);
+        const batchIds = ids.slice(startIndex, endIndex);
+
+        const batchResults = await prisma.repo.findMany({
+            where: {
+                id: { in: batchIds },
+                orgId,
+            }
+        });
+        results.push(...batchResults);
+    }
+
+    return results;
+}
+
+/**
+ * Fetches repos by names in batches to prevent memory issues
+ * @param names - Array of repo names to fetch
+ * @param orgId - Organization ID to filter by
+ * @param batchSize - Size of each batch (default: 500)
+ * @returns Array of repos
+ */
+export async function batchedFindReposByNames(
+    names: string[],
+    orgId: number,
+    batchSize: number = DEFAULT_BATCH_SIZE
+): Promise<Repo[]> {
+    if (names.length === 0) {
+        return [];
+    }
+
+    const results: Repo[] = [];
+    const totalBatches = Math.ceil(names.length / batchSize);
+
+    for (let i = 0; i < totalBatches; i++) {
+        const startIndex = i * batchSize;
+        const endIndex = Math.min(startIndex + batchSize, names.length);
+        const batchNames = names.slice(startIndex, endIndex);
+
+        const batchResults = await prisma.repo.findMany({
+            where: {
+                name: { in: batchNames },
+                orgId,
+            }
+        });
+        results.push(...batchResults);
+    }
+
+    return results;
+}
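
The two exported helpers differ only in the field they filter on. As a sketch of how the duplication could be factored out (not part of this commit; `findInBatches` is a hypothetical helper reusing the file's DEFAULT_BATCH_SIZE), a generic chunked fetcher could take a per-batch query callback:

// Sketch: generic chunked fetcher that both exported functions could delegate to.
async function findInBatches<T, R>(
    values: T[],
    fetchBatch: (batch: T[]) => Promise<R[]>,
    batchSize: number = DEFAULT_BATCH_SIZE
): Promise<R[]> {
    const results: R[] = [];
    for (let i = 0; i < values.length; i += batchSize) {
        const batch = values.slice(i, i + batchSize);
        results.push(...await fetchBatch(batch));
    }
    return results;
}

// Usage, mirroring batchedFindReposByIds:
// const repos = await findInBatches(ids, (batch) =>
//     prisma.repo.findMany({ where: { id: { in: batch }, orgId } }));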
