Skip to content

Commit

Permalink
feat(py/core): implemented json extractor library (based on JS impl) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
pavelgj authored Mar 1, 2025
1 parent 25c1efe commit 91a73c1
Show file tree
Hide file tree
Showing 4 changed files with 420 additions and 0 deletions.
2 changes: 2 additions & 0 deletions py/packages/genkit/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies = [
"pydantic>=2.10.5",
"requests>=2.32.3",
"dotprompt",
"partial-json-parser>=0.2.1.1.post5",
"json5>=0.10.0",
]
description = "Genkit AI Framework"
license = { text = "Apache-2.0" }
Expand Down
238 changes: 238 additions & 0 deletions py/packages/genkit/src/genkit/core/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# Copyright 2025 Google LLC
# SPDX-License-Identifier: Apache-2.0

"""Utility functions for extracting JSON data from text and markdown."""

from typing import Any

import json5
from partial_json_parser import loads

CHAR_NON_BREAKING_SPACE = '\u00a0'


def parse_partial_json(json_string: str) -> Any:
    """Parse a possibly truncated JSON string into a Python value.

    This is a thin wrapper that hands the input straight to
    ``partial_json_parser.loads`` so that incomplete documents (for example a
    response that is still being streamed) can still be decoded.

    Args:
        json_string: The complete or partial JSON text to parse.

    Returns:
        Whatever ``partial_json_parser.loads`` produces for the input.

    Raises:
        Exception: Any error raised by ``partial_json_parser.loads`` is
            propagated unchanged; nothing is caught here.
    """
    # TODO: add handling for malformed JSON cases.
    parsed = loads(json_string)
    return parsed


def extract_json(text: str, throw_on_bad_json: bool = True) -> Any:
    """Extract the first JSON object or array embedded in a string.

    The text is scanned for the first ``{`` or ``[`` that is not inside a
    double-quoted string. From there the scanner tracks nesting of that same
    bracket type until the matching closer is found, and the enclosed slice
    is parsed with ``json5``. If the text ends before the structure is
    closed, the remainder is handed to ``parse_partial_json`` instead.

    Args:
        text: The string to extract JSON from.
        throw_on_bad_json: If True, raise a ValueError when no valid JSON can
            be extracted. If False, return None in such cases.

    Returns:
        The parsed JSON value, or None if nothing could be extracted and
        ``throw_on_bad_json`` is False.

    Raises:
        ValueError: If ``throw_on_bad_json`` is True and no valid JSON can be
            extracted.

    Examples:
        >>> extract_json(' { "key" : "value" } ')
        {'key': 'value'}
        >>> extract_json('{"key": "value",}')  # Trailing comma
        {'key': 'value'}
        >>> extract_json('some text {"key": "value"} more text')
        {'key': 'value'}
        >>> extract_json('invalid json', throw_on_bad_json=False) is None
        True
    """
    opening_char = None
    closing_char = None
    start_pos = None
    nesting_count = 0
    in_string = False
    escape_next = False

    # The scanner only ever compares characters against the quote, backslash
    # and bracket characters, so no whitespace normalisation is needed here.
    for i, char in enumerate(text):
        if escape_next:
            # The previous character was a backslash; skip this one verbatim.
            escape_next = False
            continue

        if char == '\\':
            escape_next = True
            continue

        if char == '"':
            in_string = not in_string
            continue

        if in_string:
            # Brackets inside string literals are not structural.
            continue

        if opening_char is None:
            if char in ('{', '['):
                # First structural bracket: this is the element to extract.
                opening_char = char
                closing_char = '}' if char == '{' else ']'
                start_pos = i
                nesting_count += 1
        elif char == opening_char:
            nesting_count += 1
        elif char == closing_char:
            nesting_count -= 1
            if nesting_count == 0:
                # Reached the end of the target element.
                try:
                    return json5.loads(text[start_pos : i + 1])
                except ValueError as err:
                    # A balanced but unparseable candidate must honour
                    # throw_on_bad_json just like the other failure paths.
                    if throw_on_bad_json:
                        raise ValueError(
                            f'Invalid JSON extracted from model output: {text}'
                        ) from err
                    return None

    if start_pos is not None and nesting_count > 0:
        # The structure was opened but never closed: try lenient parsing of
        # the incomplete tail.
        try:
            return parse_partial_json(text[start_pos:])
        except Exception as err:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit are never swallowed.
            if throw_on_bad_json:
                raise ValueError(
                    f'Invalid JSON extracted from model output: {text}'
                ) from err
            return None

    if throw_on_bad_json:
        raise ValueError(f'Invalid JSON extracted from model output: {text}')
    return None


class ExtractItemsResult:
    """Container for the outcome of one array-item extraction pass.

    Attributes:
        items: The list of items passed to the constructor, stored as given.
        cursor: The integer cursor position passed to the constructor.
    """

    def __init__(self, items: list, cursor: int):
        # Both values are stored as-is; the list is not copied.
        self.items, self.cursor = items, cursor


def extract_items(text: str, cursor: int = 0) -> ExtractItemsResult:
    """Extract complete JSON objects from the first array found in the text.

    When ``cursor`` is 0 the function first locates the first ``[`` in the
    text and starts scanning just after it. For any other cursor value,
    scanning starts at ``cursor``. Each top-level ``{...}`` span (braces
    inside double-quoted strings are ignored) is parsed with ``json5``, and
    scanning stops at the first ``]`` seen outside any object.

    Args:
        text: The string to extract items from.
        cursor: The position to resume scanning from (default: 0). Passing
            back the cursor of a previous result lets a growing string be
            processed incrementally.

    Returns:
        An `ExtractItemsResult` object containing:
        - `items`: The objects parsed during this call.
        - `cursor`: The index immediately after the last successfully
          parsed object. If no object was parsed, this is the starting
          position of the scan; if `cursor` was 0 and no array is found, it
          is the length of the text.

    Examples:
        >>> text = '[{"a": 1}, {"b": 2}, {"c": 3}]'
        >>> result = extract_items(text)
        >>> result.items
        [{'a': 1}, {'b': 2}, {'c': 3}]
        >>> result.cursor
        29
        >>> text = ' [ {"x": 10}, {"y": 20} ] '
        >>> result = extract_items(text)
        >>> result.items
        [{'x': 10}, {'y': 20}]
        >>> result.cursor
        23
        >>> text = 'some text [ {"p": 100} , {"q": 200} ] more text'
        >>> result = extract_items(text, cursor=10)
        >>> result.items
        [{'p': 100}, {'q': 200}]
        >>> result.cursor
        35
        >>> text = 'no array here'
        >>> result = extract_items(text)
        >>> result.items
        []
        >>> result.cursor
        13
    """
    items = []
    current_cursor = cursor

    # Find the first array start if we haven't already processed any text.
    if cursor == 0:
        array_start = text.find('[')
        if array_start == -1:
            return ExtractItemsResult(items=[], cursor=len(text))
        current_cursor = array_start + 1

    object_start = -1
    brace_count = 0
    in_string = False
    escape_next = False

    # Scan from the cursor position, tracking string and brace state.
    for i in range(current_cursor, len(text)):
        char = text[i]

        if escape_next:
            # The previous character was a backslash; skip this one verbatim.
            escape_next = False
            continue

        if char == '\\':
            escape_next = True
            continue

        if char == '"':
            in_string = not in_string
            continue

        if in_string:
            # Braces and brackets inside string literals are not structural.
            continue

        if char == '{':
            if brace_count == 0:
                object_start = i
            brace_count += 1
        elif char == '}':
            brace_count -= 1
            if brace_count == 0 and object_start != -1:
                try:
                    obj = json5.loads(text[object_start : i + 1])
                except Exception:
                    # Best effort: skip an object that fails to parse, but
                    # let KeyboardInterrupt/SystemExit propagate.
                    continue
                items.append(obj)
                current_cursor = i + 1
                object_start = -1
        elif char == ']' and brace_count == 0:
            # End of array.
            break

    return ExtractItemsResult(items=items, cursor=current_cursor)
Loading

0 comments on commit 91a73c1

Please sign in to comment.