Skip to content

Commit

Permalink
cli-pull-entries
Browse files Browse the repository at this point in the history
  • Loading branch information
MadDataScience committed Feb 28, 2025
1 parent e97f0e9 commit 398099c
Show file tree
Hide file tree
Showing 6 changed files with 337 additions and 2 deletions.
108 changes: 108 additions & 0 deletions src/cli-pull-entries.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env node
import uniq from 'lodash/uniq';

import yargs from 'yargs-parser';
import { logger } from './logger';
import colors from 'colors';
import { buildTranscendGraphQLClient } from './graphql';
import { DEFAULT_TRANSCEND_API } from './constants';
import { pullUnstructuredSubDataPointRecommendations } from './data-inventory';
import { writeCsv } from './cron';
import { splitCsvToList } from './requests';
import { DataCategoryType } from '@transcend-io/privacy-types';

/**
 * Sync entries from Transcend inventory to a CSV
 *
 * Parses CLI arguments, validates `--parentCategories` against
 * `DataCategoryType`, pulls the matching entries through the GraphQL API and
 * writes them to the file given by `--file` (default `./entries.csv`).
 * Exits the process with code 1 on missing auth, invalid categories, or any
 * error raised while pulling/writing.
 *
 * Dev Usage:
 * yarn ts-node ./src/cli-pull-entries.ts --auth=$TRANSCEND_API_KEY
 *
 * Standard usage
 * yarn cli-pull-entries --auth=$TRANSCEND_API_KEY
 */
async function main(): Promise<void> {
  // Parse command line arguments
  const {
    file = './entries.csv',
    transcendUrl = DEFAULT_TRANSCEND_API,
    auth,
    dataSiloIds = '',
    includeGuessedCategories = 'false',
    parentCategories = '',
    subCategories = '',
  } = yargs(process.argv.slice(2));

  // Ensure auth is passed
  if (!auth) {
    logger.error(
      colors.red(
        'A Transcend API key must be provided. You can specify using --auth=$TRANSCEND_API_KEY',
      ),
    );
    process.exit(1);
  }

  // Validate parentCategories
  const parsedParentCategories = splitCsvToList(
    parentCategories,
  ) as DataCategoryType[];
  const invalidParentCategories = parsedParentCategories.filter(
    (type) => !Object.values(DataCategoryType).includes(type),
  );
  if (invalidParentCategories.length > 0) {
    logger.error(
      colors.red(
        `Failed to parse parentCategories:"${invalidParentCategories.join(
          ',',
        )}".\n` +
          `Expected one of: \n${Object.values(DataCategoryType).join('\n')}`,
      ),
    );
    process.exit(1);
  }

  try {
    // Create a GraphQL client
    const client = buildTranscendGraphQLClient(transcendUrl, auth);

    const entries = await pullUnstructuredSubDataPointRecommendations(client, {
      dataSiloIds: splitCsvToList(dataSiloIds),
      // A bare `--includeGuessedCategories` flag is parsed as boolean `true`,
      // while `--includeGuessedCategories=true` is parsed as the string 'true';
      // stringify first so both forms enable the option.
      includeGuessedCategories: String(includeGuessedCategories) === 'true',
      parentCategories: parsedParentCategories,
      subCategories: splitCsvToList(subCategories), // TODO: https://transcend.height.app/T-40482 - do by name not ID
    });

    logger.info(colors.magenta(`Writing entries to file "${file}"...`));
    // Union of the column names across all rows, in first-seen order
    let headers: string[] = [];
    const inputs = entries.map((entry) => {
      const result = {
        'Property ID': entry.id,
        'Data Silo': entry.dataSiloId, // FIXME
        Object: entry.scannedObjectId, // FIXME
        'Object Path': entry.scannedObjectPathId, // FIXME
        Property: entry.name,
        'Data Categories': entry.categories
          .map((category) => `${category.category}:${category.name}`)
          .join(', '),
        // 'Guessed Category': entry.pendingCategoryGuesses?.[0]
        //   ? `${entry.pendingCategoryGuesses![0]!.category.category}:${
        //       entry.pendingCategoryGuesses![0]!.category.name
        //     }`
        //   : '',
      };
      headers = uniq([...headers, ...Object.keys(result)]);
      return result;
    });
    writeCsv(file, inputs, headers);
  } catch (err) {
    // The thrown value is not guaranteed to be an Error instance, so narrow
    // before reading `.message` and fall back to its string form otherwise.
    const message = err instanceof Error ? err.message : String(err);
    logger.error(
      colors.red(`An error occurred syncing the entries: ${message}`),
    );
    process.exit(1);
  }

  // Indicate success
  logger.info(colors.green(`Successfully synced entries to disk at ${file}!`));
}

main();
30 changes: 29 additions & 1 deletion src/codecs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ import {
} from '@transcend-io/airgap.js-types';
import { buildEnabledRouteType } from './helpers/buildEnabledRouteType';
import { buildAIIntegrationType } from './helpers/buildAIIntegrationType';
import { OpenAIRouteName, PathfinderPolicyName } from './enums';
import {
OpenAIRouteName,
PathfinderPolicyName,
// FIXME - move to privacy-types
UnstructuredSubDataPointRecommendationStatus,
} from './enums';
import { LanguageKey } from '@transcend-io/internationalization';

/**
Expand Down Expand Up @@ -268,6 +273,29 @@ export const DataCategoryGuessInput = t.intersection([
/** Type override */
export type DataCategoryGuessInput = t.TypeOf<typeof DataCategoryGuessInput>;

/**
* A guessed data category from the content classifier.
*
* Runtime codec made of a required part (`category`, `status`, `confidence`)
* intersected with an optional part (`classifierVersion`).
*/
export const DataCategoryRecommendationInput = t.intersection([
t.type({
/** The data category being recommended, as a `DataCategoryPreviewInput` */
category: DataCategoryPreviewInput,
/** Status of guess - one of the `UnstructuredSubDataPointRecommendationStatus` values */
status: valuesOf(UnstructuredSubDataPointRecommendationStatus),
/** Confidence level of guess (numeric; the expected range is not specified here) */
confidence: t.number,
}),
t.partial({
/** Classifier version that produced the guess (optional) */
classifierVersion: t.number,
}),
]);

/** Static TypeScript type derived from the `DataCategoryRecommendationInput` codec */
export type DataCategoryRecommendationInput = t.TypeOf<
typeof DataCategoryRecommendationInput
>;

export const AttributeValueInput = t.intersection([
t.type({
/** Name of attribute value */
Expand Down
1 change: 1 addition & 0 deletions src/data-inventory/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from './pullAllDatapoints';
export * from './pullUnstructuredSubDataPointRecommendations';
176 changes: 176 additions & 0 deletions src/data-inventory/pullUnstructuredSubDataPointRecommendations.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import cliProgress from 'cli-progress';
import { gql } from 'graphql-request';
import colors from 'colors';
import sortBy from 'lodash/sortBy';
import type { GraphQLClient } from 'graphql-request';
import type { DataCategoryInput } from '../codecs';
import type { UnstructuredSubDataPointRecommendationStatus } from '../enums';
import { SUB_DATA_POINTS_COUNT, makeGraphQLRequest } from '../graphql';
import { logger } from '../logger';
import type { DatapointFilterOptions } from './pullAllDatapoints';

/**
* Shape of a single unstructured subdatapoint recommendation as returned by
* `pullUnstructuredSubDataPointRecommendations`.
*
* NOTE(review): the GraphQL query in this file currently selects only `id`,
* `name` and `categories`; the remaining fields declared here are not
* requested by that query - confirm whether they should be added to the
* query or made optional.
*/
interface UnstructuredSubDataPointRecommendationCsvPreview {
/** ID of subDatapoint */
id: string;
/** Name (or key) of the subdatapoint */
name: string;
/** Personal data category */
categories: DataCategoryInput[];
/** Scanned object ID */
scannedObjectId: string;
/** Scanned object path ID */
scannedObjectPathId: string;
/** The data silo ID */
dataSiloId: string;
/** Data category guesses that are output by the classifier (optional) */
pendingCategoryGuesses?: {
/** Data category being guessed */
category: DataCategoryInput;
/** Status of recommendation */
status: UnstructuredSubDataPointRecommendationStatus;
/** Classifier version that produced the guess */
classifierVersion: number;
}[];
}

/**
 * Pull unstructured subdatapoint information
 *
 * Requests a total count (used only to size the progress bar), then pages
 * through the results with `first`/`offset` until a page comes back smaller
 * than `pageSize`. The combined results are returned sorted by `name`.
 *
 * @param client - Client to use for the request
 * @param options - Options: filters (data silo IDs, parent categories, sub-category IDs) and page size
 * @returns The fetched recommendations, sorted by name
 * @throws Any error raised by the count request or by a page request; page errors are logged with the failing offset first
 */
export async function pullUnstructuredSubDataPointRecommendations(
  client: GraphQLClient,
  {
    dataSiloIds = [],
    // includeGuessedCategories,
    parentCategories = [],
    subCategories = [],
    pageSize = 1000,
  }: DatapointFilterOptions & {
    /** Page size to pull in */
    pageSize?: number;
  } = {},
): Promise<UnstructuredSubDataPointRecommendationCsvPreview[]> {
  const unstructuredSubDataPointRecommendations: UnstructuredSubDataPointRecommendationCsvPreview[] =
    [];

  // Time duration
  const t0 = new Date().getTime();

  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar(
    {},
    cliProgress.Presets.shades_classic,
  );

  // Filters - each key is only included when the matching option is non-empty
  const filterBy = {
    ...(parentCategories.length > 0 ? { category: parentCategories } : {}),
    ...(subCategories.length > 0 ? { subCategoryIds: subCategories } : {}),
    // if parentCategories or subCategories and not includeGuessedCategories
    // ...(parentCategories.length + subCategories.length > 0 &&
    // !includeGuessedCategories
    //   ? // then only show data points with approved data categories
    //     // FIXME should include validated, corrected, manually added; should exclude classified and rejected
    //     { status: UnstructuredSubDataPointRecommendationStatus.Validated }
    //   : {}),
    ...(dataSiloIds.length > 0 ? { dataSilos: dataSiloIds } : {}),
  };

  // Fetch the total number of matches so the progress bar can be sized.
  // NOTE(review): this reads `unstructuredSubDataPointRecommendations` from the
  // response of SUB_DATA_POINTS_COUNT, which is defined elsewhere - confirm
  // that query actually returns a field with this name.
  const {
    unstructuredSubDataPointRecommendations: { totalCount },
  } = await makeGraphQLRequest<{
    /** Query response */
    unstructuredSubDataPointRecommendations: {
      /** Count */
      totalCount: number;
    };
  }>(client, SUB_DATA_POINTS_COUNT, {
    filterBy,
  });

  logger.info(colors.magenta('[Step 1/3] Pulling in all subdatapoints'));

  progressBar.start(totalCount, 0);
  let total = 0;
  let shouldContinue = false;
  // ID of the last node received; only used to give context in error logs
  let cursor: string | undefined;
  let offset = 0;
  do {
    try {
      const {
        unstructuredSubDataPointRecommendations: { nodes },
        // eslint-disable-next-line no-await-in-loop
      } = await makeGraphQLRequest<{
        /** Query response */
        unstructuredSubDataPointRecommendations: {
          /** List of matches */
          nodes: UnstructuredSubDataPointRecommendationCsvPreview[];
        };
      }>(
        client, // FIXME below incomplete
        gql`
          query TranscendCliUnstructuredSubDataPointRecommendationCsvExport(
            $filterBy: SubDataPointFiltersInput
            $first: Int!
            $offset: Int!
          ) {
            unstructuredSubDataPointRecommendations(
              filterBy: $filterBy
              first: $first
              offset: $offset
              useMaster: false
            ) {
              nodes {
                id
                name
                categories {
                  name
                  category
                }
              }
            }
          }
        `,
        {
          first: pageSize,
          offset,
          filterBy,
        },
      );

      cursor = nodes[nodes.length - 1]?.id;
      unstructuredSubDataPointRecommendations.push(...nodes);
      // A full page means there may be more to fetch. An empty page always
      // ends the loop, so a page size of 0 cannot spin forever.
      shouldContinue = nodes.length > 0 && nodes.length === pageSize;
      total += nodes.length;
      offset += nodes.length;
      progressBar.update(total);
    } catch (err) {
      // Stop the bar before logging so it is not left running when the
      // error propagates to the caller.
      progressBar.stop();
      logger.error(
        colors.red(
          `An error fetching subdatapoints for cursor ${cursor} and offset ${offset}`,
        ),
      );
      throw err;
    }
  } while (shouldContinue);

  progressBar.stop();
  const t1 = new Date().getTime();
  const totalTime = t1 - t0;

  // sortBy returns a new array; the accumulated list is left untouched
  const sorted = sortBy(unstructuredSubDataPointRecommendations, 'name');

  logger.info(
    colors.green(
      `Successfully pulled in ${sorted.length} subdatapoints in ${
        totalTime / 1000
      } seconds!`,
    ),
  );
  return sorted;
}
18 changes: 18 additions & 0 deletions src/enums.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,21 @@ export const OpenAIRouteName = makeEnum({
*/
export type OpenAIRouteName =
(typeof OpenAIRouteName)[keyof typeof OpenAIRouteName];

// FIXME - move to privacy-types
/**
* Possible statuses of a data category recommendation on an unstructured
* subdatapoint.
*/
export const UnstructuredSubDataPointRecommendationStatus = makeEnum({
/** The category was manually applied */
ManuallyAdded: 'MANUALLY_ADDED',
/** The recommendation has been corrected */
Corrected: 'CORRECTED',
/** The recommendation has been approved as valid */
Validated: 'VALIDATED',
/** The recommendation has been made but not yet validated */
Classified: 'CLASSIFIED',
/** The recommendation has been marked as wrong */
Rejected: 'REJECTED',
});

/** Union of the string values of `UnstructuredSubDataPointRecommendationStatus` */
export type UnstructuredSubDataPointRecommendationStatus =
(typeof UnstructuredSubDataPointRecommendationStatus)[keyof typeof UnstructuredSubDataPointRecommendationStatus];
6 changes: 5 additions & 1 deletion src/graphql/makeGraphQLRequest.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import { GraphQLClient, RequestDocument, Variables } from 'graphql-request';
import type {
GraphQLClient,
RequestDocument,
Variables,
} from 'graphql-request';
import { logger } from '../logger';
import colors from 'colors';

Expand Down

0 comments on commit 398099c

Please sign in to comment.