refactor(gen-schema-view): extract config parsing into its own modules and update gemini model import
firebase · Mar 4, 2025 · 1d279cf · 1d279cf
1 parent 39e704e
commit 1d279cf
Show file tree

Hide file tree

Showing 6 changed files with 432 additions and 227 deletions.
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/config/index.test.ts
@@ -0,0 +1,175 @@
+import { parseConfig } from "../../../src/config";
+import { promptInquirer } from "../../../src/config/interactive";
+import {
+  parseProgram,
+  validateNonInteractiveParams,
+} from "../../../src/config/non-interactive";
+import { readSchemas } from "../../../src/schema-loader-utils";
+
+// Stub the three modules that parseConfig imports at runtime so each test
+// decides what they return.
+jest.mock("../../../src/config/interactive", () => {
+  return { promptInquirer: jest.fn() };
+});
+
+jest.mock("../../../src/config/non-interactive", () => {
+  return {
+    parseProgram: jest.fn(),
+    validateNonInteractiveParams: jest.fn(),
+  };
+});
+
+jest.mock("../../../src/schema-loader-utils", () => {
+  return { readSchemas: jest.fn() };
+});
+
+// Make process.exit throw instead of terminating the test run; the thrown
+// message embeds the exit code so a test can assert on it.
+const mockExit = jest
+  .spyOn(process, "exit")
+  .mockImplementation((exitCode) => {
+    throw new Error(`Process exited with code ${exitCode}`);
+  });
+
+// Exercises parseConfig in both modes with parseProgram, promptInquirer,
+// validateNonInteractiveParams and readSchemas replaced by jest mocks.
+describe("parseConfig", () => {
+  beforeEach(() => {
+    // Reset call history so assertions only see calls made by the current test.
+    jest.clearAllMocks();
+  });
+
+  describe("Non-interactive mode", () => {
+    it("should return CLI config from command line arguments", async () => {
+      // Setup mocks for non-interactive mode
+      const mockProgram = {
+        nonInteractive: true,
+        project: "test-project",
+        bigQueryProject: "test-bq-project",
+        dataset: "test-dataset",
+        tableNamePrefix: "test-prefix",
+        schemaFiles: ["schema1.json", "schema2.json"],
+        outputHelp: jest.fn(),
+      };
+
+      const mockSchemas = {
+        schema1: { fields: { field1: { type: "string" } } },
+        schema2: { fields: { field2: { type: "number" } } },
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (validateNonInteractiveParams as jest.Mock).mockReturnValue(true);
+      (readSchemas as jest.Mock).mockReturnValue(mockSchemas);
+
+      const result = await parseConfig();
+
+      expect(parseProgram).toHaveBeenCalled();
+      expect(validateNonInteractiveParams).toHaveBeenCalledWith(mockProgram);
+      expect(readSchemas).toHaveBeenCalledWith(mockProgram.schemaFiles);
+      expect(result).toEqual({
+        projectId: "test-project",
+        bigQueryProjectId: "test-bq-project",
+        datasetId: "test-dataset",
+        tableNamePrefix: "test-prefix",
+        schemas: mockSchemas,
+        // parseConfig always reports its default sample size (100). toEqual
+        // only ignores properties whose value is undefined, so it must be
+        // listed here; the other optional fields are undefined in this test.
+        agentSampleSize: 100,
+      });
+    });
+
+    it("should use project as bigQueryProject if not specified", async () => {
+      // Setup mocks with missing bigQueryProject
+      const mockProgram = {
+        nonInteractive: true,
+        project: "test-project",
+        bigQueryProject: undefined,
+        dataset: "test-dataset",
+        tableNamePrefix: "test-prefix",
+        schemaFiles: ["schema.json"],
+        outputHelp: jest.fn(),
+      };
+
+      const mockSchemas = { schema: { fields: { field: { type: "string" } } } };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (validateNonInteractiveParams as jest.Mock).mockReturnValue(true);
+      (readSchemas as jest.Mock).mockReturnValue(mockSchemas);
+
+      const result = await parseConfig();
+
+      expect(result.bigQueryProjectId).toBe("test-project");
+    });
+
+    it("should exit if required parameters are missing", async () => {
+      const mockProgram = {
+        nonInteractive: true,
+        outputHelp: jest.fn(),
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (validateNonInteractiveParams as jest.Mock).mockReturnValue(false);
+
+      // The process.exit spy throws, which surfaces as a rejected promise.
+      await expect(parseConfig()).rejects.toThrow("Process exited with code 1");
+      expect(mockProgram.outputHelp).toHaveBeenCalled();
+      expect(mockExit).toHaveBeenCalledWith(1);
+    });
+  });
+
+  describe("Interactive mode", () => {
+    it("should return CLI config from inquirer prompts", async () => {
+      // Setup mocks for interactive mode
+      const mockProgram = {
+        nonInteractive: false,
+      };
+
+      const mockPromptResponse = {
+        project: "interactive-project",
+        bigQueryProject: "interactive-bq-project",
+        dataset: "interactive-dataset",
+        tableNamePrefix: "interactive-prefix",
+        schemaFiles: "schema1.json, schema2.json",
+      };
+
+      const mockSchemas = {
+        schema1: { fields: { field1: { type: "string" } } },
+        schema2: { fields: { field2: { type: "number" } } },
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse);
+      (readSchemas as jest.Mock).mockReturnValue(mockSchemas);
+
+      const result = await parseConfig();
+
+      expect(parseProgram).toHaveBeenCalled();
+      expect(promptInquirer).toHaveBeenCalled();
+      expect(readSchemas).toHaveBeenCalledWith([
+        "schema1.json",
+        "schema2.json",
+      ]);
+      expect(result).toEqual({
+        projectId: "interactive-project",
+        bigQueryProjectId: "interactive-bq-project",
+        datasetId: "interactive-dataset",
+        tableNamePrefix: "interactive-prefix",
+        schemas: mockSchemas,
+        // Same default sample size as in non-interactive mode; see above.
+        agentSampleSize: 100,
+      });
+    });
+
+    it("should properly trim and split schema file paths", async () => {
+      const mockProgram = {
+        nonInteractive: false,
+      };
+
+      const mockPromptResponse = {
+        project: "test-project",
+        bigQueryProject: "test-bq-project",
+        dataset: "test-dataset",
+        tableNamePrefix: "test-prefix",
+        schemaFiles: " schema1.json,  schema2.json , schema3.json",
+      };
+
+      (parseProgram as jest.Mock).mockReturnValue(mockProgram);
+      (promptInquirer as jest.Mock).mockResolvedValue(mockPromptResponse);
+      (readSchemas as jest.Mock).mockReturnValue({});
+
+      await parseConfig();
+
+      // Verify that file paths are properly trimmed and split
+      expect(readSchemas).toHaveBeenCalledWith([
+        "schema1.json",
+        "schema2.json",
+        "schema3.json",
+      ]);
+    });
+  });
+});
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/index.ts
@@ -0,0 +1,66 @@
+import { FirestoreSchema } from "../schema";
+import { readSchemas } from "../schema-loader-utils";
+import { promptInquirer } from "./interactive";
+import { parseProgram, validateNonInteractiveParams } from "./non-interactive";
+
+/** Value reported as `agentSampleSize` in every config returned by `parseConfig`. */
+const DEFAULT_SAMPLE_SIZE = 100;
+
+/**
+ * Resolved settings returned by `parseConfig`.
+ *
+ * Exported so callers can name the return type of `parseConfig` directly.
+ */
+export interface CliConfig {
+  /** Taken from the `project` flag or prompt answer. */
+  projectId: string;
+  /** Taken from the `bigQueryProject` flag or prompt answer. */
+  bigQueryProjectId: string;
+  /** Taken from the `dataset` flag or prompt answer. */
+  datasetId: string;
+  /** Taken from the `tableNamePrefix` flag or prompt answer. */
+  tableNamePrefix: string;
+  // TODO: isn't this the same as tableNamePrefix? check.
+  collectionPath?: string;
+  /** Result of `readSchemas` for the supplied schema file paths, keyed by schema name. */
+  schemas: { [schemaName: string]: FirestoreSchema };
+  /** Taken from the `useGemini` flag or prompt answer. */
+  useGemini?: boolean;
+  /** Currently always `DEFAULT_SAMPLE_SIZE`. */
+  agentSampleSize?: number;
+  /** Taken from the `googleAiKey` flag or prompt answer. */
+  googleAiKey?: string;
+}
+
+/**
+ * Builds the CLI configuration either from the parsed command line
+ * (non-interactive mode) or from the answers to the interactive prompts.
+ *
+ * In non-interactive mode the parsed program is validated first; when
+ * validation fails, `program.outputHelp()` is called and the process exits
+ * with code 1.
+ *
+ * @returns The resolved configuration, with `schemas` already loaded through
+ *   `readSchemas`.
+ */
+export async function parseConfig(): Promise<CliConfig> {
+  const program = parseProgram();
+
+  if (program.nonInteractive) {
+    if (!validateNonInteractiveParams(program)) {
+      program.outputHelp();
+      process.exit(1);
+    }
+
+    return {
+      projectId: program.project,
+      // Fall back to the Firebase project when no BigQuery project is given.
+      bigQueryProjectId: program.bigQueryProject || program.project,
+      datasetId: program.dataset,
+      tableNamePrefix: program.tableNamePrefix,
+      collectionPath: program.collectionPath,
+      schemas: readSchemas(program.schemaFiles),
+      useGemini: program.useGemini,
+      agentSampleSize: DEFAULT_SAMPLE_SIZE,
+      googleAiKey: program.googleAiKey,
+    };
+  }
+
+  const answers = await promptInquirer();
+
+  // The answer is a comma-separated list. Trim each entry and drop empty ones
+  // (blank answer, trailing or doubled commas) so readSchemas never receives
+  // an empty path.
+  const schemaFilePaths = (answers.schemaFiles || "")
+    .split(",")
+    .map((schemaFileName) => schemaFileName.trim())
+    .filter((schemaFileName) => schemaFileName !== "");
+
+  return {
+    projectId: answers.project,
+    // Same fallback as the non-interactive branch, for consistency.
+    bigQueryProjectId: answers.bigQueryProject || answers.project,
+    datasetId: answers.dataset,
+    tableNamePrefix: answers.tableNamePrefix,
+    collectionPath: answers.collectionPath,
+    schemas: readSchemas(schemaFilePaths),
+    useGemini: answers.useGemini,
+    agentSampleSize: DEFAULT_SAMPLE_SIZE,
+    // TODO: rename?
+    googleAiKey: answers.googleAiKey,
+  };
+}
diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/config/interactive.ts
@@ -0,0 +1,97 @@
+import inquirer from "inquirer";
+
+// One or more ASCII letters, digits or underscores.
+const BIGQUERY_VALID_CHARACTERS = /^[a-zA-Z0-9_]+$/;
+// One or more characters, none of which is a forward slash.
+const FIRESTORE_VALID_CHARACTERS = /^[^\/]+$/;
+// A lowercase letter followed by up to 29 lowercase letters, digits or hyphens.
+const GCP_PROJECT_VALID_CHARACTERS = /^[a-z][a-z0-9-]{0,29}$/;
+
+/**
+ * Validates a prompt answer against a character pattern.
+ *
+ * @param value - The raw answer to check.
+ * @param name - Human-readable field name interpolated into the messages.
+ * @param regex - Pattern the answer must match.
+ * @returns `true` when the answer is acceptable, otherwise an error message.
+ */
+const validateInput = (value: unknown, name: string, regex: RegExp) => {
+  // Non-string, empty and whitespace-only answers all count as "not supplied";
+  // the type check also keeps `.trim()` from throwing on a non-string value.
+  if (typeof value !== "string" || value.trim() === "") {
+    return `Please supply a ${name}`;
+  }
+  if (!value.match(regex)) {
+    // Kept generic: the allowed characters depend on the pattern passed in.
+    return `The ${name} contains invalid characters`;
+  }
+  return true;
+};
+
+/**
+ * Prompt definitions passed to `inquirer.prompt` by `promptInquirer`.
+ *
+ * Each entry's `name` becomes the key of the corresponding answer. Entries
+ * with a `when` callback are only asked when `useGemini` was answered
+ * truthily.
+ */
+export const questions = [
+  {
+    message: "What is your Firebase project ID?",
+    name: "project",
+    default: process.env.PROJECT_ID,
+    type: "input",
+    // NOTE(review): validated with the Firestore pattern rather than
+    // GCP_PROJECT_VALID_CHARACTERS - confirm this is intentional.
+    validate: (value) =>
+      validateInput(value, "project ID", FIRESTORE_VALID_CHARACTERS),
+  },
+  {
+    message:
+      "What is your Google Cloud Project ID for BigQuery? (can be the same as the Firebase project ID)",
+    name: "bigQueryProject",
+    default: process.env.PROJECT_ID,
+    type: "input",
+    validate: (value) =>
+      validateInput(value, "BigQuery project ID", GCP_PROJECT_VALID_CHARACTERS),
+  },
+  {
+    message:
+      "What is the ID of the BigQuery dataset the raw changelog lives in? (The dataset and the raw changelog must already exist!)",
+    name: "dataset",
+    type: "input",
+    validate: (value) =>
+      validateInput(value, "dataset ID", BIGQUERY_VALID_CHARACTERS),
+  },
+  {
+    message:
+      "What is the name of the Cloud Firestore collection for which you want to generate a schema view?",
+    name: "tableNamePrefix",
+    type: "input",
+    validate: (value) =>
+      validateInput(value, "table name prefix", BIGQUERY_VALID_CHARACTERS),
+  },
+  {
+    // No validator: a blank answer is accepted here.
+    message:
+      "Where should this script look for schema definitions? (Enter a comma-separated list of, optionally globbed, paths to files or directories).",
+    name: "schemaFiles",
+    type: "input",
+  },
+  {
+    message:
+      "Would you like to use Gemini to automatically analyze your data and generate a draft schema?",
+    name: "useGemini",
+    type: "confirm",
+    default: false,
+  },
+  // TODO: I dont think this is required as we have it above
+  // TODO: can we make the questions conditional? if we select useGemini then dont ask about finding schema files?
+  {
+    message: "What is the Firestore collection path you want to analyze?",
+    name: "collectionPath",
+    type: "input",
+    when: (answers) => answers.useGemini,
+    // NOTE(review): the Firestore pattern rejects any "/" - confirm that
+    // nested collection paths are not meant to be supported here.
+    validate: (value) =>
+      validateInput(value, "collection path", FIRESTORE_VALID_CHARACTERS),
+  },
+  {
+    message: "Please provide your Google AI API Key:",
+    name: "googleAiKey",
+    type: "password",
+    when: (answers) => answers.useGemini,
+    // Only presence is checked; the key's format is not validated.
+    validate: (value) => {
+      if (!value || value.trim() === "") {
+        return "Google AI API Key is required";
+      }
+      return true;
+    },
+  },
+  {
+    message: "Where should the generated schema files be stored?",
+    name: "schemaDirectory",
+    type: "input",
+    when: (answers) => answers.useGemini,
+    default: "./schemas",
+  },
+];
+
+/**
+ * Runs the interactive prompt sequence defined by `questions`.
+ *
+ * @returns Whatever `inquirer.prompt` returns for `questions`.
+ */
+export const promptInquirer = () => inquirer.prompt(questions);