From a9165cae3aa990be2b93729bdf5554f8b9174cd6 Mon Sep 17 00:00:00 2001
From: Corie Watson <watson.corie@gmail.com>
Date: Wed, 5 Mar 2025 23:44:44 +0000
Subject: [PATCH] WIP

---
 .../src/__tests__/config/index.test.ts        | 76 +++++++++++++++-
 .../gen-schema-view/src/config/index.ts       | 15 ++-
 .../gen-schema-view/src/config/interactive.ts | 30 +++---
 .../src/config/non-interactive.ts             |  2 +-
 .../gen-schema-view/src/schema/genkit.ts      | 91 +++++++++++--------
 5 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts
index cba71b221..4e416aa62 100644
--- a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts
+++ b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts
@@ -89,6 +89,33 @@ describe("parseConfig", () => {
       expect(result.bigQueryProjectId).toBe("test-project");
     });
 
+    it("should use gemini if specified", async () => { // TODO: This test needs to be completed
+      // Setup mocks with useGemini = true
+      const mockProgram = {
+        nonInteractive: true,
+        project: "test-project",
+        bigQueryProject: "test-bq-project",
+        dataset: "test-dataset",
+        tableNamePrefix: "test-prefix",
+        schemaFiles: ["schema.json"],
+        useGemini: true,
+        googleAiKey: "test-key",
+        geminiAnalyzeCollectionPath: "test-collection",
+        schemaDirectory: "test-directory",
+        outputHelp: jest.fn(),
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (validateNonInteractiveParams as jest.Mock).mockReturnValue(true);
+
+      const result = await parseConfig();
+
+      expect(result.useGemini).toBe(true);
+      expect(result.googleAiKey).toBe("test-key");
+      expect(result.geminiAnalyzeCollectionPath).toBe("test-collection");
+      expect(result.schemaDirectory).toBe("test-directory");
+    });
+
     it("should exit if required parameters are missing", async () => {
       const mockProgram = {
         nonInteractive: true,
@@ -104,7 +131,7 @@ describe("parseConfig", () => {
     });
   });
 
-  describe("Interactive mode", () => {
+  describe("Interactive mode without Gemini", () => {
     it("should return CLI config from inquirer prompts", async () => {
       // Setup mocks for interactive mode
       const mockProgram = {
@@ -116,6 +143,7 @@ describe("parseConfig", () => {
         bigQueryProject: "interactive-bq-project",
         dataset: "interactive-dataset",
         tableNamePrefix: "interactive-prefix",
+        useGemini: false,
         schemaFiles: "schema1.json, schema2.json",
       };
 
@@ -155,6 +183,7 @@ describe("parseConfig", () => {
         bigQueryProject: "test-bq-project",
         dataset: "test-dataset",
         tableNamePrefix: "test-prefix",
+        useGemini: false,
         schemaFiles: " schema1.json,  schema2.json , schema3.json",
       };
 
@@ -172,4 +201,49 @@ describe("parseConfig", () => {
       ]);
     });
   });
+
+  describe("Interactive mode with Gemini", () => {
+    it("should return CLI config from inquirer prompts", async () => {
+      // Interactive run: parseProgram only signals that prompts are needed.
+      const mockProgram = {
+        nonInteractive: false,
+      };
+
+      const mockPromptResponse = {
+        project: "interactive-project",
+        bigQueryProject: "interactive-bq-project",
+        dataset: "interactive-dataset",
+        tableNamePrefix: "interactive-prefix",
+        useGemini: true,
+        googleAiKey: "test-key",
+        geminiAnalyzeCollectionPath: "test-collection",
+        schemaDirectory: "test-directory",
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse);
+      // Forget calls recorded by earlier tests so the negative check below is meaningful.
+      (readSchemas as jest.Mock).mockClear();
+
+      const result = await parseConfig();
+
+      expect(parseProgram).toHaveBeenCalled();
+      expect(promptInquirer).toHaveBeenCalled();
+      // With useGemini set, parseConfig skips readSchemas and returns empty schemas.
+      expect(readSchemas).not.toHaveBeenCalled();
+      expect(result.schemas).toEqual({});
+
+      // agentSampleSize is filled from DEFAULT_SAMPLE_SIZE, so match only prompt-derived fields.
+      expect(result).toMatchObject({
+        projectId: "interactive-project",
+        bigQueryProjectId: "interactive-bq-project",
+        datasetId: "interactive-dataset",
+        tableNamePrefix: "interactive-prefix",
+        useGemini: true,
+        googleAiKey: "test-key",
+        geminiAnalyzeCollectionPath: "test-collection",
+        schemaDirectory: "test-directory",
+      });
+    });
+  });
 });
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts
index c235fbf9b..a39d05bfe 100644
--- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts
+++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts
@@ -14,8 +14,10 @@ export interface CliConfig {
   tableNamePrefix: string;
   schemas: { [schemaName: string]: FirestoreSchema };
   useGemini?: boolean;
+  geminiAnalyzeCollectionPath?: string;
   agentSampleSize?: number;
   googleAiKey?: string;
+  schemaDirectory?: string;
 }
 
 export async function parseConfig(): Promise<CliConfig> {
@@ -33,8 +35,10 @@ export async function parseConfig(): Promise<CliConfig> {
       tableNamePrefix: program.tableNamePrefix,
       useGemini: program.useGemini,
       schemas: !program.useGemini ? readSchemas(program.schemaFiles) : {},
+      geminiAnalyzeCollectionPath: program.geminiAnalyzeCollectionPath,
       agentSampleSize: DEFAULT_SAMPLE_SIZE,
       googleAiKey: program.googleAiKey,
+      schemaDirectory: program.schemaDirectory,
     };
   }
   const {
@@ -44,20 +48,23 @@ export async function parseConfig(): Promise<CliConfig> {
     tableNamePrefix,
     schemaFiles,
     useGemini,
-    // TODO: rename?
+    geminiAnalyzeCollectionPath,
     googleAiKey,
+    schemaDirectory,
   } = await promptInquirer();
 
   return {
     projectId: project,
     bigQueryProjectId: bigQueryProject,
     datasetId: dataset,
-    tableNamePrefix: tableNamePrefix,
+    tableNamePrefix,
     schemas: !useGemini ? readSchemas(
       schemaFiles.split(",").map((schemaFileName) => schemaFileName.trim())
     ) : {},
-    useGemini: useGemini,
+    useGemini,
+    geminiAnalyzeCollectionPath,
     agentSampleSize: DEFAULT_SAMPLE_SIZE,
-    googleAiKey: googleAiKey,
+    googleAiKey,
+    schemaDirectory,
   };
 }
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts
index 8ff5de796..c532d8704 100644
--- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts
+++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts
@@ -18,7 +18,7 @@ export const questions = [
   {
     message: "What is your Firebase project ID?",
     name: "project",
-    default: process.env.PROJECT_ID,
+    default: "dev-extensions-testing",
     type: "input",
     validate: (value) =>
       validateInput(value, "project ID", FIRESTORE_VALID_CHARACTERS),
@@ -27,7 +27,7 @@ export const questions = [
     message:
       "What is your Google Cloud Project ID for BigQuery? (can be the same as the Firebase project ID)",
     name: "bigQueryProject",
-    default: process.env.PROJECT_ID,
+    default: "dev-extensions-testing",
     type: "input",
     validate: (value) =>
       validateInput(value, "BigQuery project ID", GCP_PROJECT_VALID_CHARACTERS),
@@ -37,23 +37,26 @@ export const questions = [
       "What is the ID of the BigQuery dataset the raw changelog lives in? (The dataset and the raw changelog must already exist!)",
     name: "dataset",
     type: "input",
+    default: "2025_stress_test",
     validate: (value) =>
       validateInput(value, "dataset ID", BIGQUERY_VALID_CHARACTERS),
   },
   {
     message:
-      "What is the name of the Cloud Firestore collection for which you want to generate a schema view?",
+      "What prefix should be used for the names of the views generated by this script?",
     name: "tableNamePrefix",
     type: "input",
+    default: "2025_stress_test",
     validate: (value) =>
       validateInput(value, "table name prefix", BIGQUERY_VALID_CHARACTERS),
+    requiredOption: false,
   },
   {
     message:
       "Would you like to use a Gemini to automatically analyze your data and generate a draft schema?",
     name: "useGemini",
     type: "confirm",
-    default: false,
+    default: true,
   },
   {
     message:
@@ -62,20 +65,11 @@ export const questions = [
     type: "input",
     when: (answers) => !answers.useGemini,
   },
-  // TODO: I dont think this is required as we have it above
-  // TODO: can we make the questions conditional? if we select useGemini then dont ask about finding schema files?
-  // {
-  //   message: "What is the Firestore collection path you want to analyze?",
-  //   name: "collectionPath",
-  //   type: "input",
-  //   when: (answers) => answers.useGemini,
-  //   validate: (value) =>
-  //     validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS),
-  // },
   {
     message: "Please provide your Google AI API Key:",
     name: "googleAiKey",
     type: "password",
+    // Deliberately no default: an API key must be typed at the prompt, never committed to source.
     when: (answers) => answers.useGemini,
     validate: (value) => {
       if (!value || value.trim() === "") {
@@ -84,6 +78,14 @@ export const questions = [
       return true;
     },
   },
+  {
+    message: "What is the Firestore collection path you want Gemini to analyze?",
+    name: "geminiAnalyzeCollectionPath",
+    type: "input",
+    when: (answers) => answers.useGemini,
+    validate: (value) =>
+      validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS),
+  },
   {
     message: "Where should the generated schema files be stored?",
     name: "schemaDirectory",
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts
index ab35af9a8..1af2b2384 100644
--- a/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts
+++ b/firestore-bigquery-export/scripts/gen-schema-view/src/config/non-interactive.ts
@@ -50,7 +50,7 @@ export const configureProgram = () => {
       false
     )
     .option(
-      "-c, --collection-path <path>",
+      "-c, --gemini-analyze-collection-path <path>",
       "Firestore collection path for Gemini to analyze"
     )
     .option(
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts
index ee1d0f804..9b35dc715 100644
--- a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts
+++ b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts
@@ -2,10 +2,9 @@ import type { CliConfig } from "../config";
 import firebase = require("firebase-admin");
 import { genkit, z } from "genkit";
 import { googleAI, gemini20Flash } from "@genkit-ai/googleai";
-import * as fs from "fs/promises";
+import * as fs from "fs";
 import * as path from "path";
 import inquirer from "inquirer";
-import {SchemaSchema} from './genkitSchema'
 
 export async function sampleFirestoreDocuments(
   collectionPath: string,
@@ -25,7 +24,6 @@ export async function sampleFirestoreDocuments(
       return serializeDocument(data);
     });
 
-    console.log(`Successfully sampled ${documents.length} documents.`);
     return documents;
   } catch (error) {
     console.error("Error sampling documents:", error);
@@ -67,44 +65,19 @@ function serializeDocument(data: any): any {
   return data;
 }
 
-/**
- * Writes a schema file to the specified directory if it does not already exist.
- *
- * @param {string} schemaDirectory - The directory where schema files are stored.
- * @param {string} fileName - The name of the schema file to write.
- * @param {string} content - The content of the schema file as a JSON string.
- * @returns {Promise<string>} - A message indicating success or an error if the file already exists.
- */
-const writeSchemaFile = async (
-  schemaDirectory: string,
-  fileName: string,
-  content: string
-): Promise<string> => {
-  const filePath = path.join(schemaDirectory, fileName);
-  try {
-    await fs.access(filePath);
-    return "Error: Schema file already exists";
-  } catch {
-    await fs.writeFile(filePath, content);
-    return "Schema created successfully";
-  }
-};
-
 const biqquerySchemaPrompt = ({
-  collectionName,
+  collectionPath,
   sampleData,
-  tablePrefix,
 }: {
-  collectionName: string;
+  collectionPath: string;
   sampleData: any[];
-  tablePrefix: string;
 }) => `
     You are a Schema Management Agent for Generating BigQuery schemas from Firestore Collections. 
     Your primary tasks are:
     1. Analyze the provided sample documents
     2. Generate an appropriate BigQuery schema
   
-    I will provide you with sample documents from the collection "${collectionName}".
+    I will provide you with sample documents from the collection "${collectionPath}".
   
     Here are the sample documents to analyze:
     ${JSON.stringify(sampleData, null, 2)}
@@ -194,14 +167,19 @@ const biqquerySchemaPrompt = ({
 export const generateSchemaFilesWithGemini = async (config: CliConfig) => {
   //  get sample data from Firestore
   const sampleData = await sampleFirestoreDocuments(
-    config.tableNamePrefix!,
+    config.geminiAnalyzeCollectionPath!,
     config.agentSampleSize!
   );
 
+  if (sampleData.length === 0) {
+    console.log("Operation cancelled. No sample data found. Either the collection is empty or the collection path is incorrect.");
+    process.exit(0);
+  }
+  console.log(`Successfully sampled ${sampleData.length} documents from collection ${config.geminiAnalyzeCollectionPath}`);
+
   const prompt = biqquerySchemaPrompt({
-    collectionName: config.tableNamePrefix!,
+    collectionPath: config.geminiAnalyzeCollectionPath!,
     sampleData,
-    tablePrefix: config.tableNamePrefix,
   });
 
   // initialize genkit with googleAI plugin
@@ -218,12 +196,49 @@ export const generateSchemaFilesWithGemini = async (config: CliConfig) => {
     model: gemini20Flash,
     prompt,
     output: {
-      format: 'json',
-      schema: SchemaSchema
+      format: "json",
+      schema: z.object({
+        fields: z.array(z.object({
+          name: z.string(),
+          type: z.string(),
+          description: z.string(),
+          fields: z.array(z.object({
+            name: z.string(),
+            type: z.string(),
+            description: z.string(),
+            fields: z.array(z.object({
+              name: z.string(),
+              type: z.string(),
+              description: z.string(),
+              column_name: z.string().optional(),
+            })),
+        })),
+      })),
+    })
+  }});
+
+  const filePath = path.join(config.schemaDirectory ?? "./schemas", `${config.tableNamePrefix}.json`);
+
+  // Confirm before overwriting; the write after this check runs for new and overwritten files alike.
+  if (fs.existsSync(filePath)) {
+    const overwriteConfirm = await inquirer.prompt([
+      {
+        type: "confirm",
+        name: "proceed",
+        message:
+          "Schema file already exists. Would you like to overwrite it?",
+        default: false,
+      },
+    ]);
+
+    if (!overwriteConfirm.proceed) {
+      console.log("Operation cancelled. Please choose a different schema file name.");
+      process.exit(0);
+    }
-  });
 
-  await writeSchemaFile("./schemas", `${config.tableNamePrefix}.json`, text);
+  }
+  await fs.promises.writeFile(filePath, text);
+
   // confirm with user that schema file is correct
   const confirmation = await inquirer.prompt([
     {