From a9165cae3aa990be2b93729bdf5554f8b9174cd6 Mon Sep 17 00:00:00 2001 From: Corie Watson Date: Wed, 5 Mar 2025 23:44:44 +0000 Subject: [PATCH] WIP --- .../src/__tests__/config/index.test.ts | 76 +++++++++++++++- .../gen-schema-view/src/config/index.ts | 15 ++- .../gen-schema-view/src/config/interactive.ts | 30 +++--- .../src/config/non-interactive.ts | 2 +- .../gen-schema-view/src/schema/genkit.ts | 91 +++++++++++-------- 5 files changed, 156 insertions(+), 58 deletions(-) diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts index cba71b221..4e416aa62 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts @@ -89,6 +89,33 @@ describe("parseConfig", () => { expect(result.bigQueryProjectId).toBe("test-project"); }); + it("should use gemini if specified", async () => { // TODO: This test needs completed + // Setup mocks with useGemini = true + const mockProgram = { + nonInteractive: true, + project: "test-project", + bigQueryProject: "test-bq-project", + dataset: "test-dataset", + tableNamePrefix: "test-prefix", + schemaFiles: ["schema.json"], + useGemini: true, + googleAiKey: "test-key", + geminiAnalyzeCollectionPath: "test-collection", + schemaDirectory: "test-directory", + outputHelp: jest.fn(), + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (validateNonInteractiveParams as jest.Mock).mockReturnValue(true); + + const result = await parseConfig(); + + expect(result.useGemini).toBe(true); + expect(result.googleAiKey).toBe("test-key"); + expect(result.geminiAnalyzeCollectionPath).toBe("test-collection"); + expect(result.schemaDirectory).toBe("test-directory"); + }); + it("should exit if required parameters are missing", async () => { const mockProgram = { nonInteractive: true, @@ -104,7 +131,7 @@ describe("parseConfig", () => { }); }); - describe("Interactive mode", () => { + describe("Interactive mode without Gemini", () => { it("should return CLI config from inquirer prompts", async () => { // Setup mocks for interactive mode const mockProgram = { @@ -116,6 +143,7 @@ describe("parseConfig", () => { bigQueryProject: "interactive-bq-project", dataset: "interactive-dataset", tableNamePrefix: "interactive-prefix", + useGemini: false, schemaFiles: "schema1.json, schema2.json", }; @@ -155,6 +183,7 @@ describe("parseConfig", () => { bigQueryProject: "test-bq-project", dataset: "test-dataset", tableNamePrefix: "test-prefix", + useGemini: false, schemaFiles: " schema1.json, schema2.json , schema3.json", }; @@ -172,4 +201,49 @@ describe("parseConfig", () => { ]); }); }); + + describe("Interactive mode with Gemini", () => { // TODO: This needs completed + it("should return CLI config from inquirer prompts", async () => { + // Setup mocks for interactive mode + const mockProgram = { + nonInteractive: false, + }; + + const mockPromptResponse = { + project: "interactive-project", + bigQueryProject: "interactive-bq-project", + dataset: "interactive-dataset", + tableNamePrefix: "interactive-prefix", + useGemini: true, + googleAiKey: "test-key", + geminiAnalyzeCollectionPath: "test-collection", + schemaDirectory: "test-directory", + }; + + const mockSchemas = { + schema1: { fields: { field1: { type: "string" } } }, + schema2: { fields: { field2: { type: "number" } } }, + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse); + (readSchemas as jest.Mock).mockReturnValue(mockSchemas); + + const result = await parseConfig(); + + expect(parseProgram).toHaveBeenCalled(); + expect(promptInquirer).toHaveBeenCalled(); + expect(readSchemas).toHaveBeenCalledWith([ + "schema1.json", + "schema2.json", + ]); + expect(result).toEqual({ + projectId: "interactive-project", + bigQueryProjectId: "interactive-bq-project", + datasetId: "interactive-dataset", + tableNamePrefix: "interactive-prefix", + schemas: mockSchemas, + }); + }); + }); }); diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts index c235fbf9b..a39d05bfe 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts @@ -14,8 +14,10 @@ export interface CliConfig { tableNamePrefix: string; schemas: { [schemaName: string]: FirestoreSchema }; useGemini?: boolean; + geminiAnalyzeCollectionPath?: string; agentSampleSize?: number; googleAiKey?: string; + schemaDirectory?: string; } export async function parseConfig(): Promise { @@ -33,8 +35,10 @@ export async function parseConfig(): Promise { tableNamePrefix: program.tableNamePrefix, useGemini: program.useGemini, schemas: !program.useGemini ? readSchemas(program.schemaFiles) : {}, + geminiAnalyzeCollectionPath: program.geminiAnalyzeCollectionPath, agentSampleSize: DEFAULT_SAMPLE_SIZE, googleAiKey: program.googleAiKey, + schemaDirectory: program.schemaDirectory, }; } const { @@ -44,20 +48,23 @@ export async function parseConfig(): Promise { tableNamePrefix, schemaFiles, useGemini, - // TODO: rename? + geminiAnalyzeCollectionPath, googleAiKey, + schemaDirectory, } = await promptInquirer(); return { projectId: project, bigQueryProjectId: bigQueryProject, datasetId: dataset, - tableNamePrefix: tableNamePrefix, + tableNamePrefix, schemas: !useGemini ? readSchemas( schemaFiles.split(",").map((schemaFileName) => schemaFileName.trim()) ) : {}, - useGemini: useGemini, + useGemini, + geminiAnalyzeCollectionPath, agentSampleSize: DEFAULT_SAMPLE_SIZE, - googleAiKey: googleAiKey, + googleAiKey, + schemaDirectory, }; } diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts index 8ff5de796..c532d8704 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts @@ -18,7 +18,7 @@ export const questions = [ { message: "What is your Firebase project ID?", name: "project", - default: process.env.PROJECT_ID, + default: "dev-extensions-testing", type: "input", validate: (value) => validateInput(value, "project ID", FIRESTORE_VALID_CHARACTERS), @@ -27,7 +27,7 @@ export const questions = [ message: "What is your Google Cloud Project ID for BigQuery? (can be the same as the Firebase project ID)", name: "bigQueryProject", - default: process.env.PROJECT_ID, + default: "dev-extensions-testing", type: "input", validate: (value) => validateInput(value, "BigQuery project ID", GCP_PROJECT_VALID_CHARACTERS), @@ -37,23 +37,26 @@ export const questions = [ "What is the ID of the BigQuery dataset the raw changelog lives in? (The dataset and the raw changelog must already exist!)", name: "dataset", type: "input", + default: "2025_stress_test", validate: (value) => validateInput(value, "dataset ID", BIGQUERY_VALID_CHARACTERS), }, { message: - "What is the name of the Cloud Firestore collection for which you want to generate a schema view?", + "What prefix should be used for the names of the views generated by this script?", name: "tableNamePrefix", type: "input", + default: "2025_stress_test", validate: (value) => validateInput(value, "table name prefix", BIGQUERY_VALID_CHARACTERS), + requiredOption: false, }, { message: "Would you like to use a Gemini to automatically analyze your data and generate a draft schema?", name: "useGemini", type: "confirm", - default: false, + default: true, }, { message: @@ -62,20 +65,11 @@ export const questions = [ type: "input", when: (answers) => !answers.useGemini, }, - // TODO: I dont think this is required as we have it above - // TODO: can we make the questions conditional? if we select useGemini then dont ask about finding schema files? - // { - // message: "What is the Firestore collection path you want to analyze?", - // name: "collectionPath", - // type: "input", - // when: (answers) => answers.useGemini, - // validate: (value) => - // validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS), - // }, { message: "Please provide your Google AI API Key:", name: "googleAiKey", type: "password", + default: "AIzaSyAv_SeZkZCo_qVjrysxxtasHf6sN5yG9wg", when: (answers) => answers.useGemini, validate: (value) => { if (!value || value.trim() === "") { @@ -84,6 +78,14 @@ export const questions = [ return true; }, }, + { + message: "What is the Firestore collection path you want Gemini to analyze?", + name: "geminiAnalyzeCollectionPath", + type: "input", + when: (answers) => answers.useGemini, + validate: (value) => + validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS), + }, { message: "Where should the generated schema files be stored?", name: "schemaDirectory", diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts index ab35af9a8..1af2b2384 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts @@ -50,7 +50,7 @@ export const configureProgram = () => { false ) .option( - "-c, --collection-path ", + "-c, --gemini-analyze-collection-path ", "Firestore collection path for Gemini to analyze" ) .option( diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts index ee1d0f804..9b35dc715 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts @@ -2,10 +2,9 @@ import type { CliConfig } from "../config"; import firebase = require("firebase-admin"); import { genkit, z } from "genkit"; import { googleAI, gemini20Flash } from "@genkit-ai/googleai"; -import * as fs from "fs/promises"; +import * as fs from "fs"; import * as path from "path"; import inquirer from "inquirer"; -import {SchemaSchema} from './genkitSchema' export async function sampleFirestoreDocuments( collectionPath: string, @@ -25,7 +24,6 @@ export async function sampleFirestoreDocuments( return serializeDocument(data); }); - console.log(`Successfully sampled ${documents.length} documents.`); return documents; } catch (error) { console.error("Error sampling documents:", error); @@ -67,44 +65,19 @@ function serializeDocument(data: any): any { return data; } -/** - * Writes a schema file to the specified directory if it does not already exist. - * - * @param {string} schemaDirectory - The directory where schema files are stored. - * @param {string} fileName - The name of the schema file to write. - * @param {string} content - The content of the schema file as a JSON string. - * @returns {Promise} - A message indicating success or an error if the file already exists. - */ -const writeSchemaFile = async ( - schemaDirectory: string, - fileName: string, - content: string -): Promise => { - const filePath = path.join(schemaDirectory, fileName); - try { - await fs.access(filePath); - return "Error: Schema file already exists"; - } catch { - await fs.writeFile(filePath, content); - return "Schema created successfully"; - } -}; - const biqquerySchemaPrompt = ({ - collectionName, + collectionPath, sampleData, - tablePrefix, }: { - collectionName: string; + collectionPath: string; sampleData: any[]; - tablePrefix: string; }) => ` You are a Schema Management Agent for Generating BigQuery schemas from Firestore Collections. Your primary tasks are: 1. Analyze the provided sample documents 2. Generate an appropriate BigQuery schema - I will provide you with sample documents from the collection "${collectionName}". + I will provide you with sample documents from the collection "${collectionPath}". Here are the sample documents to analyze: ${JSON.stringify(sampleData, null, 2)} @@ -194,14 +167,19 @@ const biqquerySchemaPrompt = ({ export const generateSchemaFilesWithGemini = async (config: CliConfig) => { // get sample data from Firestore const sampleData = await sampleFirestoreDocuments( - config.tableNamePrefix!, + config.geminiAnalyzeCollectionPath!, config.agentSampleSize! ); + if (sampleData.length === 0) { + console.log("Operation cancelled. No sample data found. Either the collection is empty or the collection path is incorrect."); + process.exit(0); + } + console.log(`Successfully sampled ${sampleData.length} documents from collection ${config.geminiAnalyzeCollectionPath}`); + const prompt = biqquerySchemaPrompt({ - collectionName: config.tableNamePrefix!, + collectionPath: config.geminiAnalyzeCollectionPath!, sampleData, - tablePrefix: config.tableNamePrefix, }); // initialize genkit with googleAI plugin @@ -218,12 +196,49 @@ export const generateSchemaFilesWithGemini = async (config: CliConfig) => { model: gemini20Flash, prompt, output: { - format: 'json', - schema: SchemaSchema + format: "json", + schema: z.object({ + fields: z.array(z.object({ + name: z.string(), + type: z.string(), + description: z.string(), + fields: z.array(z.object({ + name: z.string(), + type: z.string(), + description: z.string(), + fields: z.array(z.object({ + name: z.string(), + type: z.string(), + description: z.string(), + column_name: z.string().optional(), + })), + })), + })), + }) + }}); + + const filePath = path.join(config.schemaDirectory, `${config.tableNamePrefix}.json`); + + // Check if a file exists + if (fs.existsSync(filePath)) { + const overwriteConfirm = await inquirer.prompt([ + { + type: "confirm", + name: "proceed", + message: + "Schema file already exists. Would you like to overwrite it?", + default: false, + }, + ]); + + if (!overwriteConfirm.proceed) { + console.log("Operation cancelled. Please choose a different schema file name."); + process.exit(0); } - }); - await writeSchemaFile("./schemas", `${config.tableNamePrefix}.json`, text); + await fs.promises.writeFile(filePath, text); + } + // confirm with user that schema file is correct const confirmation = await inquirer.prompt([ {