diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts new file mode 100644 index 000000000..cba71b221 --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts @@ -0,0 +1,175 @@ +import { parseConfig } from "../../../src/config"; +import { promptInquirer } from "../../../src/config/interactive"; +import { + parseProgram, + validateNonInteractiveParams, +} from "../../../src/config/non-interactive"; +import { readSchemas } from "../../../src/schema-loader-utils"; + +// Mock dependencies +jest.mock("../../../src/config/interactive", () => ({ + promptInquirer: jest.fn(), +})); + +jest.mock("../../../src/config/non-interactive", () => ({ + parseProgram: jest.fn(), + validateNonInteractiveParams: jest.fn(), +})); + +jest.mock("../../../src/schema-loader-utils", () => ({ + readSchemas: jest.fn(), +})); + +// Mock process.exit to prevent tests from actually exiting +const mockExit = jest.spyOn(process, "exit").mockImplementation((code) => { + throw new Error(`Process exited with code ${code}`); +}); + +describe("parseConfig", () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + describe("Non-interactive mode", () => { + it("should return CLI config from command line arguments", async () => { + // Setup mocks for non-interactive mode + const mockProgram = { + nonInteractive: true, + project: "test-project", + bigQueryProject: "test-bq-project", + dataset: "test-dataset", + tableNamePrefix: "test-prefix", + schemaFiles: ["schema1.json", "schema2.json"], + outputHelp: jest.fn(), + }; + + const mockSchemas = { + schema1: { fields: { field1: { type: "string" } } }, + schema2: { fields: { field2: { type: "number" } } }, + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (validateNonInteractiveParams as jest.Mock).mockReturnValue(true); + (readSchemas as jest.Mock).mockReturnValue(mockSchemas); + + const result = await parseConfig(); + + expect(parseProgram).toHaveBeenCalled(); + expect(validateNonInteractiveParams).toHaveBeenCalledWith(mockProgram); + expect(readSchemas).toHaveBeenCalledWith(mockProgram.schemaFiles); + expect(result).toEqual({ + projectId: "test-project", + bigQueryProjectId: "test-bq-project", + datasetId: "test-dataset", + tableNamePrefix: "test-prefix", + schemas: mockSchemas, + }); + }); + + it("should use project as bigQueryProject if not specified", async () => { + // Setup mocks with missing bigQueryProject + const mockProgram = { + nonInteractive: true, + project: "test-project", + bigQueryProject: undefined, + dataset: "test-dataset", + tableNamePrefix: "test-prefix", + schemaFiles: ["schema.json"], + outputHelp: jest.fn(), + }; + + const mockSchemas = { schema: { fields: { field: { type: "string" } } } }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (validateNonInteractiveParams as jest.Mock).mockReturnValue(true); + (readSchemas as jest.Mock).mockReturnValue(mockSchemas); + + const result = await parseConfig(); + + expect(result.bigQueryProjectId).toBe("test-project"); + }); + + it("should exit if required parameters are missing", async () => { + const mockProgram = { + nonInteractive: true, + outputHelp: jest.fn(), + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (validateNonInteractiveParams as jest.Mock).mockReturnValue(false); + + await expect(parseConfig()).rejects.toThrow("Process exited with code 1"); + expect(mockProgram.outputHelp).toHaveBeenCalled(); + expect(mockExit).toHaveBeenCalledWith(1); + }); + }); + + describe("Interactive mode", () => { + it("should return CLI config from inquirer prompts", async () => { + // Setup mocks for interactive mode + const mockProgram = { + nonInteractive: false, + }; + + const mockPromptResponse = { + project: "interactive-project", + bigQueryProject: "interactive-bq-project", + dataset: "interactive-dataset", + tableNamePrefix: "interactive-prefix", + schemaFiles: "schema1.json, schema2.json", + }; + + const mockSchemas = { + schema1: { fields: { field1: { type: "string" } } }, + schema2: { fields: { field2: { type: "number" } } }, + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse); + (readSchemas as jest.Mock).mockReturnValue(mockSchemas); + + const result = await parseConfig(); + + expect(parseProgram).toHaveBeenCalled(); + expect(promptInquirer).toHaveBeenCalled(); + expect(readSchemas).toHaveBeenCalledWith([ + "schema1.json", + "schema2.json", + ]); + expect(result).toEqual({ + projectId: "interactive-project", + bigQueryProjectId: "interactive-bq-project", + datasetId: "interactive-dataset", + tableNamePrefix: "interactive-prefix", + schemas: mockSchemas, + }); + }); + + it("should properly trim and split schema file paths", async () => { + const mockProgram = { + nonInteractive: false, + }; + + const mockPromptResponse = { + project: "test-project", + bigQueryProject: "test-bq-project", + dataset: "test-dataset", + tableNamePrefix: "test-prefix", + schemaFiles: " schema1.json, schema2.json , schema3.json", + }; + + (parseProgram as jest.Mock).mockReturnValue(mockProgram); + (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse); + (readSchemas as jest.Mock).mockReturnValue({}); + + await parseConfig(); + + // Verify that file paths are properly trimmed and split + expect(readSchemas).toHaveBeenCalledWith([ + "schema1.json", + "schema2.json", + "schema3.json", + ]); + }); + }); +}); diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts new file mode 100644 index 000000000..f7c61898d --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts @@ -0,0 +1,66 @@ +import { FirestoreSchema } from "../schema"; +import { readSchemas } from "../schema-loader-utils"; +import { promptInquirer } from "./interactive"; +import { parseProgram, validateNonInteractiveParams } from "./non-interactive"; + +const DEFAULT_SAMPLE_SIZE = 100; + +interface CliConfig { + projectId: string; + bigQueryProjectId: string; + datasetId: string; + tableNamePrefix: string; + // TODO: isn't this the same as tableNamePrefix? check. + collectionPath?: string; + schemas: { [schemaName: string]: FirestoreSchema }; + useGemini?: boolean; + agentSampleSize?: number; + googleAiKey?: string; +} + +export async function parseConfig(): Promise { + const program = parseProgram(); + if (program.nonInteractive) { + if (!validateNonInteractiveParams(program)) { + program.outputHelp(); + process.exit(1); + } + + return { + projectId: program.project, + bigQueryProjectId: program.bigQueryProject || program.project, + datasetId: program.dataset, + tableNamePrefix: program.tableNamePrefix, + collectionPath: program.collectionPath, + schemas: readSchemas(program.schemaFiles), + useGemini: program.useGemini, + agentSampleSize: DEFAULT_SAMPLE_SIZE, + googleAiKey: program.googleAiKey, + }; + } + const { + project, + bigQueryProject, + dataset, + tableNamePrefix, + schemaFiles, + collectionPath, + useGemini, + // TODO: rename? + googleAiKey, + } = await promptInquirer(); + + return { + projectId: project, + bigQueryProjectId: bigQueryProject, + datasetId: dataset, + tableNamePrefix: tableNamePrefix, + collectionPath: collectionPath, + schemas: readSchemas( + schemaFiles.split(",").map((schemaFileName) => schemaFileName.trim()) + ), + useGemini: useGemini, + agentSampleSize: DEFAULT_SAMPLE_SIZE, + googleAiKey: googleAiKey, + }; +} diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts new file mode 100644 index 000000000..575b85cc8 --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts @@ -0,0 +1,97 @@ +import inquirer from "inquirer"; + +const BIGQUERY_VALID_CHARACTERS = /^[a-zA-Z0-9_]+$/; +const FIRESTORE_VALID_CHARACTERS = /^[^\/]+$/; +const GCP_PROJECT_VALID_CHARACTERS = /^[a-z][a-z0-9-]{0,29}$/; + +const validateInput = (value: any, name: string, regex: RegExp) => { + if (!value || value === "" || value.trim() === "") { + return `Please supply a ${name}`; + } + if (!value.match(regex)) { + return `The ${name} must only contain letters or spaces`; + } + return true; +}; + +export const questions = [ + { + message: "What is your Firebase project ID?", + name: "project", + default: process.env.PROJECT_ID, + type: "input", + validate: (value) => + validateInput(value, "project ID", FIRESTORE_VALID_CHARACTERS), + }, + { + message: + "What is your Google Cloud Project ID for BigQuery? (can be the same as the Firebase project ID)", + name: "bigQueryProject", + default: process.env.PROJECT_ID, + type: "input", + validate: (value) => + validateInput(value, "BigQuery project ID", GCP_PROJECT_VALID_CHARACTERS), + }, + { + message: + "What is the ID of the BigQuery dataset the raw changelog lives in? (The dataset and the raw changelog must already exist!)", + name: "dataset", + type: "input", + validate: (value) => + validateInput(value, "dataset ID", BIGQUERY_VALID_CHARACTERS), + }, + { + message: + "What is the name of the Cloud Firestore collection for which you want to generate a schema view?", + name: "tableNamePrefix", + type: "input", + validate: (value) => + validateInput(value, "table name prefix", BIGQUERY_VALID_CHARACTERS), + }, + { + message: + "Where should this script look for schema definitions? (Enter a comma-separated list of, optionally globbed, paths to files or directories).", + name: "schemaFiles", + type: "input", + }, + { + message: + "Would you like to use a Gemini to automatically analyze your data and generate a draft schema?", + name: "useGemini", + type: "confirm", + default: false, + }, + // TODO: I dont think this is required as we have it above + // TODO: can we make the questions conditional? if we select useGemini then dont ask about finding schema files? + { + message: "What is the Firestore collection path you want to analyze?", + name: "collectionPath", + type: "input", + when: (answers) => answers.useGemini, + validate: (value) => + validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS), + }, + { + message: "Please provide your Google AI API Key:", + name: "googleAiKey", + type: "password", + when: (answers) => answers.useGemini, + validate: (value) => { + if (!value || value.trim() === "") { + return "Google AI API Key is required"; + } + return true; + }, + }, + { + message: "Where should the generated schema files be stored?", + name: "schemaDirectory", + type: "input", + when: (answers) => answers.useGemini, + default: "./schemas", + }, +]; + +export const promptInquirer = () => { + return inquirer.prompt(questions); +}; diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts new file mode 100644 index 000000000..0b779038d --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts @@ -0,0 +1,86 @@ +import * as program from "commander"; + +/** + * Helper function to collect multiple values for an option into an array + */ +export function collect(value: string, previous: string[]): string[] { + return previous.concat([value]); +} + +/** + * Configure the commander program with all needed options + */ +export const configureProgram = () => { + const packageJson = require("../../package.json"); + + program + .name("gen-schema-views") + .description(packageJson.description) + .version(packageJson.version) + .option( + "--non-interactive", + "Parse all input from command line flags instead of prompting the caller.", + false + ) + .option( + "-P, --project ", + "Firebase Project ID for project containing Cloud Firestore database." + ) + .option( + "-B, --big-query-project ", + "Google Cloud Project ID for BigQuery (can be the same as the Firebase project ID)." + ) + .option( + "-d, --dataset ", + "The ID of the BigQuery dataset containing a raw Cloud Firestore document changelog." + ) + .option( + "-t, --table-name-prefix ", + "A common prefix for the names of all views generated by this script." + ) + .option( + "-f, --schema-files ", + "A collection of files from which to read schemas.", + collect, + [] + ) + .option( + "-g, --use-gemini", + "Use Gemini to automatically analyze your data and generate a draft schema. You will have a chance to manually view and approve this schema before it is used.", + false + ) + .option( + "-c, --collection-path ", + "Firestore collection path for Gemini to analyze" + ) + .option( + "--schema-dir ", + "Directory to store generated schemas", + "./schemas" + ) + .option("--google-ai-key ", "Google AI API Key for Gemini"); + + return program; +}; + +/** + * Parse command line arguments + */ +export const parseProgram = () => { + const prog = configureProgram(); + prog.parse(process.argv); + return prog; +}; + +/** + * Validate required non-interactive parameters are present + * @returns {boolean} true if all required parameters are present + */ +export const validateNonInteractiveParams = (program: any): boolean => { + return !( + program.project === undefined || + program.dataset === undefined || + program.tableNamePrefix === undefined || + program.schemaFiles.length === 0 + ); +}; diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts index 56ede848a..5ac475afd 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts @@ -16,32 +16,12 @@ * limitations under the License. */ -import program = require("commander"); import firebase = require("firebase-admin"); import inquirer from "inquirer"; - import { FirestoreBigQuerySchemaViewFactory, FirestoreSchema } from "./schema"; import { readSchemas } from "./schema-loader-utils"; import { runAgent } from "./schema/genkit"; - -const BIGQUERY_VALID_CHARACTERS = /^[a-zA-Z0-9_]+$/; -const FIRESTORE_VALID_CHARACTERS = /^[^\/]+$/; -const GCP_PROJECT_VALID_CHARACTERS = /^[a-z][a-z0-9-]{0,29}$/; -const DEFAULT_SAMPLE_SIZE = 100; - -const validateInput = (value: any, name: string, regex: RegExp) => { - if (!value || value === "" || value.trim() === "") { - return `Please supply a ${name}`; - } - if (!value.match(regex)) { - return `The ${name} must only contain letters or spaces`; - } - return true; -}; - -function collect(value, previous) { - return previous.concat([value]); -} +import { parseConfig } from "./config"; export async function sampleFirestoreDocuments( collectionPath: string, @@ -103,147 +83,8 @@ function serializeDocument(data: any): any { return data; } -const packageJson = require("../package.json"); - -program - .name("gen-schema-views") - .description(packageJson.description) - .version(packageJson.version) - .option( - "--non-interactive", - "Parse all input from command line flags instead of prompting the caller.", - false - ) - .option( - "-P, --project ", - "Firebase Project ID for project containing Cloud Firestore database." - ) - .option( - "-B, --big-query-project ", - "Google Cloud Project ID for BigQuery (can be the same as the Firebase project ID)." - ) - .option( - "-d, --dataset ", - "The ID of the BigQuery dataset containing a raw Cloud Firestore document changelog." - ) - .option( - "-t, --table-name-prefix ", - "A common prefix for the names of all views generated by this script." - ) - .option( - "-c, --collection-path ", - "Firestore collection path for Gemini to analyze" - ) - .option( - "-f, --schema-files ", - "A collection of files from which to read schemas.", - collect, - [] - ) - .option( - "--use-gemini", - "Use Gemini to automatically analyze your data and generate a draft schema. You will have a chance to manually view and approve this schema before it is used.", - false - ) - .option( - "--schema-dir ", - "Directory to store generated schemas", - "./schemas" - ) - .option("--google-ai-key ", "Google AI API Key for Gemini"); - -const questions = [ - { - message: "What is your Firebase project ID?", - name: "project", - default: process.env.PROJECT_ID, - type: "input", - validate: (value) => - validateInput(value, "project ID", FIRESTORE_VALID_CHARACTERS), - }, - { - message: - "What is your Google Cloud Project ID for BigQuery? (can be the same as the Firebase project ID)", - name: "bigQueryProject", - default: process.env.PROJECT_ID, - type: "input", - validate: (value) => - validateInput(value, "BigQuery project ID", GCP_PROJECT_VALID_CHARACTERS), - }, - { - message: - "What is the ID of the BigQuery dataset the raw changelog lives in? (The dataset and the raw changelog must already exist!)", - name: "dataset", - type: "input", - validate: (value) => - validateInput(value, "dataset ID", BIGQUERY_VALID_CHARACTERS), - }, - { - message: - "What is the table name prefix for which you want to generate a schema view?", - name: "tableNamePrefix", - type: "input", - validate: (value) => - validateInput(value, "table name prefix", BIGQUERY_VALID_CHARACTERS), - }, - { - message: - "Would you like to use a Gemini to automatically analyze your data and generate a draft schema?", - name: "useGemini", - type: "confirm", - default: false, - }, - { - message: "What is the Firestore collection path you want to analyze?", - name: "collectionPath", - type: "input", - when: (answers) => answers.useGemini, - validate: (value) => - validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS), - }, - { - message: "Please provide your Google AI API Key:", - name: "googleAiKey", - type: "password", - when: (answers) => answers.useGemini, - validate: (value) => { - if (!value || value.trim() === "") { - return "Google AI API Key is required"; - } - return true; - }, - }, - { - message: - "Where should this script look for schema definitions? (Enter a comma-separated list of, optionally globbed, paths to files or directories).", - name: "schemaFiles", - type: "input", - when: (answers) => !answers.useGemini, - }, - { - message: "Where should the generated schema files be stored?", - name: "schemaDirectory", - type: "input", - when: (answers) => answers.useGemini, - default: "./schemas", - }, -]; - -interface CliConfig { - projectId: string; - bigQueryProjectId: string; - datasetId: string; - tableNamePrefix: string; - collectionPath?: string; - schemaDirectory?: string; - schemas: { [schemaName: string]: FirestoreSchema }; - useGemini?: boolean; - agentSampleSize?: number; - googleAiKey?: string; -} - async function run(): Promise { - const config: CliConfig = await parseConfig(); + const config = await parseConfig(); process.env.PROJECT_ID = config.projectId; process.env.GOOGLE_CLOUD_PROJECT = config.bigQueryProjectId; @@ -260,6 +101,7 @@ async function run(): Promise { ); if (config.useGemini) { + // TODO: move to genkit subdirectory try { const sampleData = await sampleFirestoreDocuments( config.collectionPath!, @@ -267,7 +109,8 @@ async function run(): Promise { ); const chat = runAgent( config.googleAiKey!, - config.schemaDirectory || "./schemas", + // TODO: set this somehow from user input + "./schemas", config.tableNamePrefix, config.collectionPath!, sampleData @@ -341,69 +184,6 @@ async function run(): Promise { return 0; } -async function parseConfig(): Promise { - program.parse(process.argv); - if (program.nonInteractive) { - if ( - !program.useGemini && - (program.project === undefined || - program.bigQueryProject === undefined || - program.dataset === undefined || - program.tableNamePrefix === undefined || - program.schemaFiles.length === 0) - ) { - program.outputHelp(); - process.exit(1); - } - - if (program.useGemini) { - if (!program.googleAiKey) { - console.error( - "Google AI API Key is required when using the Gemini Agent" - ); - process.exit(1); - } - - if (!program.collectionPath) { - console.error( - "Collection path is required when using the Gemini Agent" - ); - process.exit(1); - } - } - - return { - projectId: program.project, - bigQueryProjectId: program.bigQueryProject, - datasetId: program.dataset, - tableNamePrefix: program.tableNamePrefix, - collectionPath: program.collectionPath, - schemas: program.useGemini ? {} : readSchemas(program.schemaFiles), - useGemini: program.useGemini, - agentSampleSize: DEFAULT_SAMPLE_SIZE, - googleAiKey: program.googleAiKey, - }; - } - const answers = await inquirer.prompt(questions); - - return { - projectId: answers.project, - bigQueryProjectId: answers.bigQueryProject, - datasetId: answers.dataset, - tableNamePrefix: answers.tableNamePrefix, - collectionPath: answers.collectionPath, - schemas: answers.useGemini - ? {} - : readSchemas( - answers.schemaFiles - .split(",") - .map((schemaFileName) => schemaFileName.trim()) - ), - useGemini: answers.useGemini, - agentSampleSize: DEFAULT_SAMPLE_SIZE, - googleAiKey: answers.googleAiKey, - }; -} if (process.env.NODE_ENV !== "test") { run() .then((result) => { diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts index f73acce29..7079480c8 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts @@ -1,4 +1,4 @@ -import { gemini20FlashExp, googleAI } from "@genkit-ai/googleai"; +import { gemini20Flash, googleAI } from "@genkit-ai/googleai"; import { Genkit, genkit, z } from "genkit"; import * as fs from "fs/promises"; import * as path from "path"; @@ -115,7 +115,7 @@ const defineSchemaAgent = ( name: "schemaAgent", description: "Agent for managing BigQuery schema files", tools: [writeSchemaTool], - model: gemini20FlashExp, + model: gemini20Flash, }, ` You are a Schema Management Agent for Generating BigQuery schemas from Firestore Collections. @@ -237,5 +237,6 @@ export const runAgent = ( tablePrefix, sampleData ); + return ai.chat(schemaAgent); };