From 3ff33bb1f1c335b3f557bf8cbf08b109b34ee86b Mon Sep 17 00:00:00 2001 From: David Roher Date: Thu, 14 Dec 2023 13:17:30 -0500 Subject: [PATCH] 2024.0.0 (#78) * Move schemas around for easier packaging * Various conf changes * Fix drill/clickhouse * Add baseball.computer note --- .env | 9 +- README.md | 6 + docker-compose.yml | 6 +- extract/Dockerfile | 9 +- extract/parsers/retrosheet.py | 41 ++- extract/requirements.txt | 2 +- load/Dockerfile | 16 +- load/postgres_cstore_fdw/Dockerfile | 2 +- tests/test_transform.py | 2 +- transform/csv.Dockerfile | 2 +- transform/ddl.Dockerfile | 4 +- transform/requirements.txt | 2 +- transform/src/boxball_schemas/__init__.py | 6 + .../baseballdatabank.py | 0 .../retrosheet.py | 308 ++++++++++-------- transform/src/ddl_maker.py | 2 +- transform/src/parquet.py | 8 +- transform/src/schemas/__init__.py | 6 - transform/src/setup.py | 12 + 19 files changed, 263 insertions(+), 180 deletions(-) create mode 100644 transform/src/boxball_schemas/__init__.py rename transform/src/{schemas => boxball_schemas}/baseballdatabank.py (100%) rename transform/src/{schemas => boxball_schemas}/retrosheet.py (82%) delete mode 100644 transform/src/schemas/__init__.py create mode 100644 transform/src/setup.py diff --git a/.env b/.env index 8206044..9708962 100644 --- a/.env +++ b/.env @@ -1,7 +1,8 @@ -CHADWICK_VERSION=v0.9.5 -BASEBALLDATABANK_VERSION=ccb3cef05e68f0085db4ada6d4a9ebab9435b452 -RETROSHEET_VERSION=48334a58f7446d59746d81aa73c3e9fa9b2676e9 +RETROSHEET_VERSION=8449632be02cdf743932600f3218d77e059d5c91 +CHADWICK_VERSION=aff8d779500da16521542e084c35cc3e159fd536 +BASEBALLDATABANK_VERSION=28169eaf9007200d7f51160713c647eac64f9aa8 EXTRACT_DIR=extract REPO=doublewick/boxball -VERSION=2023.0.0 +VERSION=2024.0.0 +BUILD_ENV=prod diff --git a/README.md b/README.md index 4744012..5a0b7cc 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,12 @@

+**Update**: I have released a new project, [baseball.computer](https://baseball.computer), which is designed +as the successor to boxball. It is much easier to use (no Docker required, runs entirely in your browser/program) +and includes many more tables, features, and quality controls. The event schema is different, which will be the main pain point in +migration. _I aim to continue Boxball maintenance and updates as long as people are still using it,_ and I may try to rebase +boxball on top of the new project to make maintaining both easier. Please let me know if there are things you can do in Boxball that you can't do yet in baseball.computer by filing an issue on the [repo](https://github.com/droher/baseball.computer) or reaching me at david.roher@baseball.computer. + ## Introduction **Boxball** creates prepopulated databases of the two most significant open source baseball datasets: [Retrosheet](http://retrosheet.org) and the [Baseball Databank](https://github.com/chadwickbureau/baseballdatabank). 
diff --git a/docker-compose.yml b/docker-compose.yml index 38ba960..9bf1c24 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -58,8 +58,7 @@ x-clickhouse: x-drill: &drill build: - context: load/drill - dockerfile: ../Dockerfile + context: load target: drill platforms: - "linux/amd64" @@ -126,8 +125,7 @@ x-mysql: x-sqlite: &sqlite build: - context: load/sqlite - dockerfile: ../Dockerfile + context: load target: sqlite platforms: - "linux/amd64" diff --git a/extract/Dockerfile b/extract/Dockerfile index 9191fa3..e2f9726 100644 --- a/extract/Dockerfile +++ b/extract/Dockerfile @@ -2,7 +2,7 @@ ARG BUILD_ENV ARG RETROSHEET_IMAGE=get-retrosheet-${BUILD_ENV} ARG BASEBALLDATABANK_IMAGE=get-baseballdatabank-${BUILD_ENV} -FROM python:3.11-alpine3.17 AS build-common +FROM python:3.11-alpine3.19 AS build-common RUN apk add --no-cache \ parallel \ libtool \ @@ -22,14 +22,15 @@ ENV PYTHONPATH="/" # `prod` gets the full datasets, while `test` provides fixtures with small sample data for each file FROM build-common as get-retrosheet-prod ARG RETROSHEET_VERSION -RUN wget https://github.com/droher/retrosheet/archive/${RETROSHEET_VERSION}.zip -O retrosheet.zip +RUN wget https://github.com/droher/retrosheet-mirror/archive/${RETROSHEET_VERSION}.zip -O retrosheet.zip FROM build-common as get-retrosheet-test COPY fixtures/raw/retrosheet.zip . FROM build-common as get-baseballdatabank-prod ARG BASEBALLDATABANK_VERSION -RUN wget https://github.com/chadwickbureau/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip +# Temporarily grab from old fork until 2023 data appears +RUN wget https://github.com/tom-719/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip FROM build-common as get-baseballdatabank-test COPY fixtures/raw/baseballdatabank.zip . 
@@ -71,7 +72,7 @@ RUN python -u /parsers/baseballdatabank.py # Use a skinny build for deployment -FROM alpine:3.9.3 +FROM alpine:3.19.0 RUN apk add zstd WORKDIR /extract COPY --from=extract-baseballdatabank /parsed ./baseballdatabank diff --git a/extract/parsers/retrosheet.py b/extract/parsers/retrosheet.py index 3a7fdf8..e93e38a 100644 --- a/extract/parsers/retrosheet.py +++ b/extract/parsers/retrosheet.py @@ -3,6 +3,7 @@ import sys from functools import lru_cache from pathlib import Path +import shutil import fileinput from typing import Callable, Set @@ -15,8 +16,8 @@ RETROSHEET_PATH = Path("retrosheet") CODE_TABLES_PATH = Path("code_tables") -RETROSHEET_SUBDIRS = "gamelog", "schedule", "misc", "rosters", "event" -EVENT_FOLDERS = "asg", "post", "regular" +RETROSHEET_SUBDIRS = "gamelogs", "schedules", "rosters" +EVENT_FOLDERS = "allstar", "postseason", "events" PARSE_FUNCS = { "daily": "cwdaily -q -y {year} {year}*", @@ -112,26 +113,36 @@ def concat_files(input_path: Path, output_file: Path, glob: str = "*", prepend_filename: bool = False, strip_header: bool = False, check_dupes: bool = True): - files = (f for f in input_path.glob(glob) if f.is_file()) + files = [f for f in input_path.glob(glob) if f.is_file()] + if not files: + raise ValueError(f"No files found under {input_path} with glob {glob}") with open(output_file, 'wt') as fout, fileinput.input(files) as fin: lines = set() for line in fin: + year = Path(fin.filename()).stem[-4:] # Remove DOS EOF character (CRTL+Z) new_line = line.strip(DOS_EOF) + original_line = new_line if not new_line or new_line.isspace(): continue if fin.isfirstline() and strip_header: continue if prepend_filename: - year = Path(fin.filename()).stem[-4:] - new_line = "{},{}".format(year, new_line) + new_line = f"{year},{new_line}" if new_line in lines: - print("Duplicate row in {}: {}".format(fin.filename(), new_line), file=sys.stderr) + print(f"Duplicate row in {fin.filename()}: {original_line.strip()}") + continue + # TODO: Fix NLB 
roster file shape in raw data + if "roster" in output_file.name and len(new_line.split(",")) == 7: + print(f"Fixing row in file {fin.filename()} with missing data: " + original_line.strip()) + new_line = new_line.strip() + "," + elif "roster" in output_file.name and len(new_line.split(",")) < 7: + print(f"Skipping row in file {fin.filename()} with missing data: " + original_line.strip()) continue if check_dupes: lines.add(new_line) - fout.write(new_line) - return compress(output_file, OUTPUT_PATH) + fout.write(new_line.strip() + "\n") + return compress(output_file, OUTPUT_PATH) retrosheet_base = Path(RETROSHEET_PATH) output_base = Path(OUTPUT_PATH) @@ -139,20 +150,24 @@ def concat_files(input_path: Path, output_file: Path, glob: str = "*", subdirs = {subdir: retrosheet_base / subdir for subdir in RETROSHEET_SUBDIRS} print("Writing simple files...") - concat_files(subdirs["gamelog"], output_base / "gamelog.csv", glob="*.TXT", check_dupes=False) - concat_files(subdirs["schedule"], output_base / "schedule.csv", glob="*.TXT") - concat_files(subdirs["misc"], output_base / "park.csv", glob="parkcode.txt", strip_header=True) + concat_files(subdirs["gamelogs"], output_base / "gamelog.csv", glob="gl*.txt", check_dupes=False) + # TODO: Figure out how to integrate 2020-orig (leave out for now) + concat_files(subdirs["schedules"], output_base / "schedule.csv", glob="*schedule.csv", strip_header=True) + concat_files(retrosheet_base, output_base / "park.csv", glob="ballparks.csv", strip_header=True) + concat_files(retrosheet_base, output_base / "bio.csv", glob="biofile.csv", strip_header=True) concat_files(subdirs["rosters"], output_base / "roster.csv", glob="*.ROS", prepend_filename=True) @staticmethod def parse_event_types(use_parallel=True) -> None: def parse_events(output_type: str, clean_func: Callable = None): - event_base = RETROSHEET_PATH / "event" + event_base = RETROSHEET_PATH output_file = OUTPUT_PATH.joinpath(output_type).with_suffix(".csv") command_template = 
PARSE_FUNCS[output_type] f_out_inflated = open(output_file, 'w') for folder in EVENT_FOLDERS: - print(output_type, folder) + # Copy (not move) all teamfiles to each subdir + for teamfile in event_base.glob("teams/TEAM*"): + shutil.copy(teamfile, event_base.joinpath(folder)) data_path = event_base.joinpath(folder) years = {re.match("[0-9]{4}", f.stem)[0] for f in data_path.iterdir() if re.match("[0-9]{4}", f.stem)} diff --git a/extract/requirements.txt b/extract/requirements.txt index 2bd379c..c45b24f 100644 --- a/extract/requirements.txt +++ b/extract/requirements.txt @@ -1,2 +1,2 @@ pyhumps==1.6.1 -zstandard==0.15.2 +zstandard==0.22.0 diff --git a/load/Dockerfile b/load/Dockerfile index 30bff86..ccac610 100644 --- a/load/Dockerfile +++ b/load/Dockerfile @@ -1,7 +1,9 @@ ARG VERSION FROM doublewick/boxball:ddl-${VERSION} as ddl +FROM doublewick/boxball:csv-${VERSION} as csv +FROM doublewick/boxball:parquet-${VERSION} as parquet -FROM yandex/clickhouse-server:22.9.7.34 as clickhouse +FROM clickhouse/clickhouse-server:23.11.2.11 as clickhouse COPY z_load.sh /docker-entrypoint-initdb.d/ COPY --chown=clickhouse:clickhouse --from=ddl /ddl/clickhouse.sql /docker-entrypoint-initdb.d/ COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data @@ -9,7 +11,7 @@ COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data FROM drill/apache-drill:1.17.0 as drill COPY --from=parquet /transform/parquet /data -FROM mysql:8.0.31-debian as mysql +FROM mysql:8.0.35-debian as mysql ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes COPY my.cnf /etc/mysql/conf.d/ COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/ @@ -19,7 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \ COPY --chown=mysql:mysql --from=ddl /ddl/mysql.sql /docker-entrypoint-initdb.d/ COPY --chown=mysql:mysql --from=csv /transform/csv /data -FROM postgres:15.1 as postgres +FROM postgres:16.1-bookworm as postgres RUN apt-get update && apt-get install -y 
--no-install-recommends zstd zip && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -27,7 +29,7 @@ COPY A_build_conf.sql z_run_conf.sql /docker-entrypoint-initdb.d/ COPY --chown=postgres:postgres --from=ddl /ddl/postgres.sql /docker-entrypoint-initdb.d/ COPY --chown=postgres:postgres --from=csv /transform/csv /data -FROM postgres:13.2 as postgres-cstore-fdw-build +FROM postgres:13.13-bookworm as postgres-cstore-fdw-build RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -45,7 +47,7 @@ RUN cat /docker-entrypoint-initdb.d/postgres_cstore_fdw.sql FROM postgres-cstore-fdw-build as postgres-cstore-fdw -FROM alpine:3.17 as sqlite-build +FROM alpine:3.19.0 as sqlite-build RUN apk add --no-cache \ zstd \ sqlite @@ -60,10 +62,10 @@ RUN echo "Decompressing fies..." && \ zstd --rm boxball.db -FROM python:3.11-alpine3.17 AS sqlite +FROM python:3.11-alpine3.19 AS sqlite RUN apk add --no-cache \ zstd \ sqlite RUN pip install sqlite-web==0.4.1 -COPY --from=build boxball.db.zst /tmp/ +COPY --from=sqlite-build boxball.db.zst /tmp/ ENTRYPOINT zstd --rm -d /tmp/boxball.db.zst -fo /db/boxball.db && sqlite_web -H 0.0.0.0 -x /db/boxball.db diff --git a/load/postgres_cstore_fdw/Dockerfile b/load/postgres_cstore_fdw/Dockerfile index 3a7aa8a..103a9d7 100644 --- a/load/postgres_cstore_fdw/Dockerfile +++ b/load/postgres_cstore_fdw/Dockerfile @@ -2,7 +2,7 @@ ARG VERSION FROM doublewick/boxball:ddl-${VERSION} as ddl FROM doublewick/boxball:csv-${VERSION} as csv -FROM postgres:13.2 as build +FROM postgres:13.13-bookworm as build RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git 
a/tests/test_transform.py b/tests/test_transform.py index 241a8fc..c957114 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -2,7 +2,7 @@ from pathlib import Path from src import OUTPUT_PATH -from src.schemas import retrosheet_metadata, baseballdatabank_metadata, all_metadata +from src.boxball_schemas import retrosheet_metadata, baseballdatabank_metadata, all_metadata from src.ddl_factories import all_factories from src.parquet import write_files, PARQUET_PREFIX diff --git a/transform/csv.Dockerfile b/transform/csv.Dockerfile index aa8e6ab..27096aa 100644 --- a/transform/csv.Dockerfile +++ b/transform/csv.Dockerfile @@ -3,5 +3,5 @@ ARG VERSION FROM doublewick/boxball:extract-${VERSION} as extract -FROM alpine:3.9.3 +FROM alpine:3.19.0 COPY --from=extract /extract /transform/csv \ No newline at end of file diff --git a/transform/ddl.Dockerfile b/transform/ddl.Dockerfile index 9c00d56..f10d2ab 100644 --- a/transform/ddl.Dockerfile +++ b/transform/ddl.Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11-slim-bullseye AS build-common +FROM python:3.11-slim-bookworm AS build-common COPY requirements.txt . 
RUN pip install -r requirements.txt ENV PYTHONPATH="/" @@ -7,5 +7,5 @@ COPY src/ src/ FROM build-common as build-ddl RUN python -u src/ddl_maker.py -FROM alpine:3.9.3 +FROM alpine:3.19.0 COPY --from=build-ddl /ddl /ddl diff --git a/transform/requirements.txt b/transform/requirements.txt index 1796254..9b48d44 100644 --- a/transform/requirements.txt +++ b/transform/requirements.txt @@ -2,4 +2,4 @@ SQLAlchemy==1.3.23 sqlalchemy-fdw==0.3.0 clickhouse-sqlalchemy==0.1.5 pyarrow==14.0.1 -zstandard==0.17.0 +zstandard==0.22.0 diff --git a/transform/src/boxball_schemas/__init__.py b/transform/src/boxball_schemas/__init__.py new file mode 100644 index 0000000..dc7b0b2 --- /dev/null +++ b/transform/src/boxball_schemas/__init__.py @@ -0,0 +1,6 @@ +from typing import List +from sqlalchemy import MetaData +from .retrosheet import metadata as retrosheet_metadata +from .baseballdatabank import metadata as baseballdatabank_metadata + +all_metadata: List[MetaData] = [baseballdatabank_metadata, retrosheet_metadata] diff --git a/transform/src/schemas/baseballdatabank.py b/transform/src/boxball_schemas/baseballdatabank.py similarity index 100% rename from transform/src/schemas/baseballdatabank.py rename to transform/src/boxball_schemas/baseballdatabank.py diff --git a/transform/src/schemas/retrosheet.py b/transform/src/boxball_schemas/retrosheet.py similarity index 82% rename from transform/src/schemas/retrosheet.py rename to transform/src/boxball_schemas/retrosheet.py index 11a6b33..cd24675 100644 --- a/transform/src/schemas/retrosheet.py +++ b/transform/src/boxball_schemas/retrosheet.py @@ -34,14 +34,14 @@ class Comment(Base): game_id = Column(CHAR(12), doc="Game ID (home team ID + YYYYMMDD + doubleheader flag") event_id = Column(SmallInteger, doc="Commented event number") - comment = Column(String(1638), doc="Comment text") - ejected_person_id = Column(String(256), doc="ID of ejected person") - ejected_person_role_cd = Column(String(256)) - eject_umpire_id = Column(String(256), 
doc="ID of umpire who ejected person") - eject_reason = Column(String(1639)) - umpchange_inning = Column(String(256)) - umpchange_position = Column(String(256)) - umpchange_person_id = Column(String(256), doc="ID of new umpire") + comment = Column(String(2048), doc="Comment text") + ejected_person_id = Column(String(1024), doc="ID of ejected person") + ejected_person_role_cd = Column(String(1024)) + eject_umpire_id = Column(String(1024), doc="ID of umpire who ejected person") + eject_reason = Column(String(1024)) + umpchange_inning = Column(String(1024)) + umpchange_position = Column(String(1024)) + umpchange_person_id = Column(String(1024), doc="ID of new umpire") dummy_id = Column(Integer, autoincrement=True, primary_key=True) @@ -52,33 +52,77 @@ class Park(Base): __tablename__ = 'park' park_id = Column(CHAR(5), primary_key=True, doc="Park ID") - name = Column(String(41), doc="Park name") - aka = Column(String(55), doc="Common park alias") - city = Column(String(17), doc="City") - state = Column(String(9), doc="State") + name = Column(String(1024), doc="Park name") + aka = Column(String(1024), doc="Common park alias") + city = Column(String(1024), doc="City") + state = Column(String(1024), doc="State") # TODO: Handle this MySQL edge case so these can be dates again - start_date = Column(String(10), doc="First game") - end_date = Column(String(10), doc="Last game") - league = Column(CHAR(2), doc="League ID") - notes = Column(String(54), doc="Misc. notes") + start_date = Column(String(1024), doc="First game") + end_date = Column(String(1024), doc="Last game") + league = Column(CHAR(10), doc="League ID") + notes = Column(String(1024), doc="Misc. notes") class Roster(Base): """ - Contains one row for each unique combination of player, team, and year. For more detailed/convenient player - biographical data, use the `people` table from the Baseball Databank schema, joining on `retro_id`. + Contains one row for each unique combination of player, team, and year. 
There may be duplicates here. + For more detailed/convenient player + biographical data, use the `people` table from the Baseball Databank schema, joining on `retro_id`, or the bio + table below. """ __tablename__ = 'roster' # We inserted the year in preprocessing - year = Column(Integer, primary_key=True, doc="Year of roster") - player_id = Column(CHAR(8), primary_key=True, doc="Player ID") - last_name = Column(String(32), doc="Player last name") - first_name = Column(String(32), doc="Player first name") + year = Column(Integer, doc="Year of roster") + player_id = Column(CHAR(8), doc="Player ID") + last_name = Column(String(1024), doc="Player last name") + first_name = Column(String(1024), doc="Player first name") bats = Column(CHAR(1), doc="Bat handedness") throws = Column(CHAR(1), doc="Throw handedness") - team_id = Column(CHAR(3), primary_key=True, doc="Team ID") + team_id = Column(CHAR(3), doc="Team ID") # TODO: Remove duplicate roster entry(s) - position = Column(String(2), primary_key=True, doc="Primary fielding position") + position = Column(String(1024), doc="Primary fielding position") + dummy_id = Column(Integer, autoincrement=True, primary_key=True) + + +class Bio(Base): + """ + Contains one row for each player + """ + __tablename__ = 'bio' + # We inserted the year in preprocessing + player_id = Column(CHAR(8), primary_key=True, doc="Player ID") + last = Column(String(1024), doc="Player last name") + first = Column(String(1024), doc="Player first name") + nickname = Column(String(1024), doc="Player nickname") + birthdate = Column(String(1024), doc="Player birth date") + birth_city = Column(String(1024), doc="Player birth city") + birth_state = Column(String(1024), doc="Player birth state") + birth_country = Column(String(1024), doc="Player birth country") + play_debut = Column(String(1024), doc="Player debut date") + play_lastgame = Column(String(1024), doc="Player last game date") + mgr_debut = Column(String(1024), doc="Manager debut date") + 
mgr_lastgame = Column(String(1024), doc="Manager last game date") + coach_debut = Column(String(1024), doc="Coach debut date") + coach_lastgame = Column(String(1024), doc="Coach last game date") + ump_debut = Column(String(1024), doc="Umpire debut date") + ump_lastgame = Column(String(1024), doc="Umpire last game date") + deathdate = Column(String(1024), doc="Player death date") + death_city = Column(String(1024), doc="Player death city") + death_state = Column(String(1024), doc="Player death state") + death_country = Column(String(1024), doc="Player death country") + bats = Column(CHAR(1), doc="Bat handedness") + throws = Column(CHAR(1), doc="Throw handedness") + height = Column(String(1024), doc="Player height in inches") + weight = Column(String(1024), doc="Player weight in pounds") + cemetary = Column(String(1024), doc="Player burial site") + ceme_city = Column(String(1024), doc="Player burial city") + ceme_state = Column(String(1024), doc="Player burial state") + ceme_country = Column(String(1024), doc="Player burial country") + ceme_note = Column(String(1024), doc="Player burial notes") + birth_name = Column(String(1024), doc="Player birth name") + name_chg = Column(String(1024), doc="Player name change notes") + bat_chg = Column(String(1024), doc="Player batting change notes") + hof = Column(String(1024), doc="String indicating Hall of Fame status") class Schedule(Base): @@ -98,7 +142,7 @@ class Schedule(Base): home_team_league = Column(CHAR(2), doc="Home team league ID") home_team_game_number = Column(Integer, primary_key=True, doc="Home team game number") day_night = Column(CHAR(1), doc="D - day, N - night") - postponement_indicator = Column(String(120), doc=""" + postponement_indicator = Column(String(1024), doc=""" This field will contain one or more phrases related to the game if it was not played as scheduled. If there is more than one phrase, they are separated by a semi-colon (";"). 
There are three possible outcomes for games not played @@ -107,7 +151,7 @@ class Schedule(Base): -- The game was played on the original date but at another site -- The game was not played """) - makeup_dates = Column(String(120), doc=""" + makeup_dates = Column(String(1024), doc=""" This field will contain a makeup date if the postponed game was played at another time or place. If an attempt was known to have been made on a date but postponed again, that date will be listed. In that case, there will be a second @@ -125,7 +169,7 @@ class CodeEvent(Base): __tablename__ = 'code_event' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class CodeFieldPark(Base): @@ -135,7 +179,7 @@ class CodeFieldPark(Base): __tablename__ = 'code_field_park' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class CodeMethodRecord(Base): @@ -145,7 +189,7 @@ class CodeMethodRecord(Base): __tablename__ = 'code_method_record' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class CodePitchesRecord(Base): @@ -155,7 +199,7 @@ class CodePitchesRecord(Base): __tablename__ = 'code_pitches_record' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class CodePrecipPark(Base): @@ -165,7 +209,7 @@ class CodePrecipPark(Base): __tablename__ = 'code_precip_park' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class CodeSkyPark(Base): @@ -175,7 +219,7 @@ class CodeSkyPark(Base): __tablename__ = 'code_sky_park' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class 
CodeWindDirectionPark(Base): @@ -185,7 +229,7 @@ class CodeWindDirectionPark(Base): __tablename__ = 'code_wind_direction_park' code = Column(SmallInteger, primary_key=True, autoincrement=False) - description = Column(String(30)) + description = Column(String(1024)) class DeducedGame(Base): @@ -215,31 +259,31 @@ class Game(Base): game_dt = Column(Date, doc="Game date") game_ct = Column(SmallInteger, doc="Doubleheader flag (0 - only game of day, 1 - first game of doubleheader, " "2 - second game of doubleheader") - game_dy = Column(String(9), doc="Day of week") + game_dy = Column(String(1024), doc="Day of week") start_game_tm = Column(SmallInteger, doc="Game start time (12HMM coded as integer, eg 1015 for 10:15 PM)") - dh_fl = Column(String(1), doc="DH used") - daynight_park_cd = Column(String(1), doc="D - day game, N - night game") + dh_fl = Column(String(1024), doc="DH used") + daynight_park_cd = Column(String(1024), doc="D - day game, N - night game") away_team_id = Column(CHAR(3), doc="Away team ID") home_team_id = Column(CHAR(3), doc="Home team ID") - park_id = Column(String(5), doc="Park ID") + park_id = Column(String(1024), doc="Park ID") away_start_pit_id = Column(CHAR(8), doc="Away team starting pitcher ID") home_start_pit_id = Column(CHAR(8), doc="Home team starting pitcher ID") # 32 rather than 8 to protect against "(unknown)" and names where there should be IDs - base4_ump_id = Column(String(32), doc="Home plate umpire ID") - base1_ump_id = Column(String(32), doc="First base umpire ID") - base2_ump_id = Column(String(32), doc="Second base umpire ID") - base3_ump_id = Column(String(32), doc="Third base umpire ID") + base4_ump_id = Column(String(1024), doc="Home plate umpire ID") + base1_ump_id = Column(String(1024), doc="First base umpire ID") + base2_ump_id = Column(String(1024), doc="Second base umpire ID") + base3_ump_id = Column(String(1024), doc="Third base umpire ID") lf_ump_id = Column(CHAR(8), doc="Left field umpire ID") rf_ump_id = Column(CHAR(8), 
doc="Right field umpire ID") attend_park_ct = Column(Integer, doc="Attendance") - scorer_record_id = Column(String(50), doc="Scorekeeper") - translator_record_id = Column(String(50), doc="Translator") - inputter_record_id = Column(String(50), doc="Inputter") + scorer_record_id = Column(String(1024), doc="Scorekeeper") + translator_record_id = Column(String(1024), doc="Translator") + inputter_record_id = Column(String(1024), doc="Inputter") # TODO: Figure out how to parse in parquet - input_record_ts = Column(String(20), doc="Date and time of record input") - edit_record_ts = Column(String(20), doc="Date and time of Most recent record edit") - method_record_cd = Column(String(1), doc="How the game was scored (join `code_method_record` for details") - pitches_record_cd = Column(String(1), doc="Highest detail of pitches recorded " + input_record_ts = Column(String(1024), doc="Date and time of record input") + edit_record_ts = Column(String(1024), doc="Date and time of Most recent record edit") + method_record_cd = Column(String(1024), doc="How the game was scored (join `code_method_record` for details") + pitches_record_cd = Column(String(1024), doc="Highest detail of pitches recorded " "(join `code_pitches_record` for details). 
Note that many games with " "pitch detail do not have that info for all events, so pitch totals " "may not be accurate.") @@ -321,16 +365,16 @@ class Game(Base): "") away_finish_pit_id = Column(CHAR(8), doc="Away team finishing pitcher") home_finish_pit_id = Column(CHAR(8), doc="Home team finishing pitcher") - away_team_league_id = Column(CHAR(1), doc="Away team league (1 char ID)") - home_team_league_id = Column(CHAR(1), doc="Home team league (1 char ID)") + away_team_league_id = Column(CHAR(3), doc="Away team league (1 char ID)") + home_team_league_id = Column(CHAR(3), doc="Home team league (1 char ID)") away_team_game_ct = Column(SmallInteger, doc="Away team game number") home_team_game_ct = Column(SmallInteger, doc="Home team game number") outs_ct = Column(SmallInteger, doc="Length of game in outs") - completion_tx = Column(String(26), doc="Information on completion of game") - forfeit_tx = Column(String(26), doc="Information on forfeit of game") - protest_tx = Column(String(26), doc="Information on protest of game") - away_line_tx = Column(String(26), doc="Away team linescore") - home_line_tx = Column(String(26), doc="Home team linescore") + completion_tx = Column(String(1024), doc="Information on completion of game") + forfeit_tx = Column(String(1024), doc="Information on forfeit of game") + protest_tx = Column(String(1024), doc="Information on protest of game") + away_line_tx = Column(String(1024), doc="Away team linescore") + home_line_tx = Column(String(1024), doc="Home team linescore") away_ab_ct = Column(SmallInteger, doc="Away team at bats") away_2b_ct = Column(SmallInteger, doc="Away team doubles") away_3b_ct = Column(SmallInteger, doc="Away team triples") @@ -381,41 +425,41 @@ class Game(Base): home_pb_ct = Column(SmallInteger, doc="Home team passed balls") home_dp_ct = Column(SmallInteger, doc="Home team double plays turned") home_tp_ct = Column(SmallInteger, doc="Home team triple plays turned") - ump_home_name_tx = Column(String(26), doc="Home plate 
umpire name") - ump_1b_name_tx = Column(String(26), doc="First base umpire name") - ump_2b_name_tx = Column(String(26), doc="Second base umpire name") - ump_3b_name_tx = Column(String(26), doc="Third base umpire name") - ump_lf_name_tx = Column(String(26), doc="Left field umpire name") - ump_rf_name_tx = Column(String(26), doc="Right field umpire name") + ump_home_name_tx = Column(String(1024), doc="Home plate umpire name") + ump_1b_name_tx = Column(String(1024), doc="First base umpire name") + ump_2b_name_tx = Column(String(1024), doc="Second base umpire name") + ump_3b_name_tx = Column(String(1024), doc="Third base umpire name") + ump_lf_name_tx = Column(String(1024), doc="Left field umpire name") + ump_rf_name_tx = Column(String(1024), doc="Right field umpire name") away_manager_id = Column(CHAR(8), doc="Away manager ID") - away_manager_name_tx = Column(String(26), doc="Away manager name") + away_manager_name_tx = Column(String(1024), doc="Away manager name") home_manager_id = Column(CHAR(8), doc="Home manager ID") - home_manager_name_tx = Column(String(26), doc="Home manager name") - win_pit_name_tx = Column(String(26), doc="Wining pitcher name") - lose_pit_name_tx = Column(String(26), doc="Losing pitcher name") - save_pit_name_tx = Column(String(26), doc="Saving pitcher name") + home_manager_name_tx = Column(String(1024), doc="Home manager name") + win_pit_name_tx = Column(String(1024), doc="Wining pitcher name") + lose_pit_name_tx = Column(String(1024), doc="Losing pitcher name") + save_pit_name_tx = Column(String(1024), doc="Saving pitcher name") goahead_rbi_id = Column(CHAR(8), doc="ID of batter with goahead RBI") - goahead_rbi_name_tx = Column(String(26), doc="Name of batter with goahead RBI") - away_lineup1_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 1") - away_lineup2_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 2") - away_lineup3_bat_name_tx = Column(String(26), doc="Name of away 
team batter in lineup position 3") - away_lineup4_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 4") - away_lineup5_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 5") - away_lineup6_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 6") - away_lineup7_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 7") - away_lineup8_bat_name_tx = Column(String(26), doc="Name of away team batter in lineup position 8") - away_lineup9_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 9") - home_lineup1_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 1") - home_lineup2_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 2") - home_lineup3_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 3") - home_lineup4_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 4") - home_lineup5_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 5") - home_lineup6_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 6") - home_lineup7_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 7") - home_lineup8_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 8") - home_lineup9_bat_name_tx = Column(String(26), doc="Name of home team batter in lineup position 9") - add_info_tx = Column(String(26), doc="Additional information") - acq_info_tx = Column(String(26), doc="Acquisition information") + goahead_rbi_name_tx = Column(String(1024), doc="Name of batter with goahead RBI") + away_lineup1_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 1") + away_lineup2_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 2") + away_lineup3_bat_name_tx = 
Column(String(1024), doc="Name of away team batter in lineup position 3") + away_lineup4_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 4") + away_lineup5_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 5") + away_lineup6_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 6") + away_lineup7_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 7") + away_lineup8_bat_name_tx = Column(String(1024), doc="Name of away team batter in lineup position 8") + away_lineup9_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 9") + home_lineup1_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 1") + home_lineup2_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 2") + home_lineup3_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 3") + home_lineup4_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 4") + home_lineup5_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 5") + home_lineup6_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 6") + home_lineup7_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 7") + home_lineup8_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 8") + home_lineup9_bat_name_tx = Column(String(1024), doc="Name of home team batter in lineup position 9") + add_info_tx = Column(String(1024), doc="Additional information") + acq_info_tx = Column(String(1024), doc="Acquisition information") class Gamelog(Base): @@ -451,7 +495,7 @@ class Gamelog(Base): home_runs_score = Column(SmallInteger, doc="Home team runs scored") length_in_outs = Column(SmallInteger, doc="Game length in outs") day_night = Column(CHAR(1), doc="D - day game, N - 
night game") - completion_info = Column(String(23), doc=""" + completion_info = Column(String(1024), doc=""" Completion information. If the game was completed at a later date (either due to a suspension or an upheld protest) this field will include: @@ -464,16 +508,16 @@ class Gamelog(Base): All the rest of the information in the record refers to the entire game. """) - forfeit_info = Column(String(3), doc="V - forfeited to away team, H - forfeited to home team, " + forfeit_info = Column(String(1024), doc="V - forfeited to away team, H - forfeited to home team, " "T - ruled a no-decision") - protest_info = Column(String(3), doc="P - protested by unidentified team, V - disallowed protest by away team, " + protest_info = Column(String(1024), doc="P - protested by unidentified team, V - disallowed protest by away team, " "H - disallowed protest by home team, X - upheld protest by away team, " "Y - upheld protest by home team") park_id = Column(CHAR(5), doc="Park ID") attendance = Column(Integer, doc="Attendance") duration = Column(SmallInteger, doc="Time of game in minutes") - vistor_line_score = Column(String(26), doc="Away team line score, e.g. 010000(10)0x") - home_line_score = Column(String(26), doc="Home team line score, e.g. 010000(10)0x") + vistor_line_score = Column(String(1024), doc="Away team line score, e.g. 010000(10)0x") + home_line_score = Column(String(1024), doc="Home team line score, e.g. 
010000(10)0x") visitor_ab = Column(SmallInteger, doc="Away team at bats") visitor_h = Column(SmallInteger, doc="Away team hits") visitor_d = Column(SmallInteger, doc="Away team doubles") @@ -531,88 +575,88 @@ class Gamelog(Base): home_db = Column(SmallInteger, doc="Home team double plays turned") home_tp = Column(SmallInteger, doc="Home team triple plays turned") umpire_h_id = Column(CHAR(8), doc="Home plate umpire ID") - umpire_h_name = Column(String(32), doc="Home plate umpire name") + umpire_h_name = Column(String(1024), doc="Home plate umpire name") umpire_1b_id = Column(CHAR(8), doc="First base umpire ID") - umpire_1b_name = Column(String(32), doc="First base umpire name") + umpire_1b_name = Column(String(1024), doc="First base umpire name") umpire_2b_id = Column(CHAR(8), doc="Second base umpire ID") - umpire_2b_name = Column(String(32), doc="Second base umpire name") + umpire_2b_name = Column(String(1024), doc="Second base umpire name") umpire_3b_id = Column(CHAR(8), doc="Third base umpire ID") - umpire_3b_name = Column(String(32), doc="Third base umpire name") + umpire_3b_name = Column(String(1024), doc="Third base umpire name") umpire_lf_id = Column(CHAR(8), doc="Left field umpire ID") - umpire_lf_name = Column(String(32), doc="Left field umpire name") + umpire_lf_name = Column(String(1024), doc="Left field umpire name") umpire_rf_id = Column(CHAR(8), doc="Right field umpire ID") - umpire_rf_name = Column(String(32), doc="Right field umpire name") + umpire_rf_name = Column(String(1024), doc="Right field umpire name") visitor_manager_id = Column(CHAR(8), doc="Away team manager ID") - visitor_manager_name = Column(String(32), doc="Away team manager name") + visitor_manager_name = Column(String(1024), doc="Away team manager name") home_manager_id = Column(CHAR(8), doc="Home team manager ID") - home_manager_name = Column(String(32), doc="Home team manager name") + home_manager_name = Column(String(1024), doc="Home team manager name") winning_pitcher_id = 
Column(CHAR(8), doc="Winning pitcher ID") - winning_pitcher_name = Column(String(32), doc="Winning pitcher name") + winning_pitcher_name = Column(String(1024), doc="Winning pitcher name") losing_pitcher_id = Column(CHAR(8), doc="Losing pitcher ID") - losing_pitcher_name = Column(String(32), doc="Losing pitcher name") + losing_pitcher_name = Column(String(1024), doc="Losing pitcher name") saving_pitcher_id = Column(CHAR(8), doc="Saving pitcher ID") - saving_pitcher_name = Column(String(32), doc="Saving pitcher name") + saving_pitcher_name = Column(String(1024), doc="Saving pitcher name") game_winning_rbi_id = Column(CHAR(8), doc="Game-winning RBI ID") - game_winning_rbi_name = Column(String(32), doc="Game-winning RBI name") + game_winning_rbi_name = Column(String(1024), doc="Game-winning RBI name") visitor_starting_pitcher_id = Column(CHAR(8), doc="Away team starting pitcher ID") - visitor_starting_pitcher_name = Column(String(32), doc="Away team starting pitcher name") + visitor_starting_pitcher_name = Column(String(1024), doc="Away team starting pitcher name") home_starting_pitcher_id = Column(CHAR(8), doc="Home team starting pitcher ID") - home_starting_pitcher_name = Column(String(32), doc="Home team starting pitcher name") + home_starting_pitcher_name = Column(String(1024), doc="Home team starting pitcher name") visitor_batting_1_player_id = Column(CHAR(8), doc="Away team lineup slot 1 starting player ID") - visitor_batting_1_name = Column(String(32), doc="Away team lineup slot 1 starting player name") + visitor_batting_1_name = Column(String(1024), doc="Away team lineup slot 1 starting player name") visitor_batting_1_position = Column(SmallInteger, doc="Away team lineup slot 1 starting player fielding position") visitor_batting_2_player_id = Column(CHAR(8), doc="Away team lineup slot 2 starting player ID") - visitor_batting_2_name = Column(String(32), doc="Away team lineup slot 2 starting player name") + visitor_batting_2_name = Column(String(1024), doc="Away 
team lineup slot 2 starting player name") visitor_batting_2_position = Column(SmallInteger, doc="Away team lineup slot 2 starting player fielding position") visitor_batting_3_player_id = Column(CHAR(8), doc="Away team lineup slot 3 starting player ID") - visitor_batting_3_name = Column(String(32), doc="Away team lineup slot 3 starting player name") + visitor_batting_3_name = Column(String(1024), doc="Away team lineup slot 3 starting player name") visitor_batting_3_position = Column(SmallInteger, doc="Away team lineup slot 3 starting player fielding position") visitor_batting_4_player_id = Column(CHAR(8), doc="Away team lineup slot 4 starting player ID") - visitor_batting_4_name = Column(String(32), doc="Away team lineup slot 4 starting player name") + visitor_batting_4_name = Column(String(1024), doc="Away team lineup slot 4 starting player name") visitor_batting_4_position = Column(SmallInteger, doc="Away team lineup slot 4 starting player fielding position") visitor_batting_5_player_id = Column(CHAR(8), doc="Away team lineup slot 5 starting player ID") - visitor_batting_5_name = Column(String(32), doc="Away team lineup slot 5 starting player name") + visitor_batting_5_name = Column(String(1024), doc="Away team lineup slot 5 starting player name") visitor_batting_5_position = Column(SmallInteger, doc="Away team lineup slot 5 starting player fielding position") visitor_batting_6_player_id = Column(CHAR(8), doc="Away team lineup slot 6 starting player ID") - visitor_batting_6_name = Column(String(32), doc="Away team lineup slot 6 starting player name") + visitor_batting_6_name = Column(String(1024), doc="Away team lineup slot 6 starting player name") visitor_batting_6_position = Column(SmallInteger, doc="Away team lineup slot 6 starting player fielding position") visitor_batting_7_player_id = Column(CHAR(8), doc="Away team lineup slot 7 starting player ID") - visitor_batting_7_name = Column(String(32), doc="Away team lineup slot 7 starting player name") + 
visitor_batting_7_name = Column(String(1024), doc="Away team lineup slot 7 starting player name") visitor_batting_7_position = Column(SmallInteger, doc="Away team lineup slot 7 starting player fielding position") visitor_batting_8_player_id = Column(CHAR(8), doc="Away team lineup slot 8 starting player ID") - visitor_batting_8_name = Column(String(32), doc="Away team lineup slot 8 starting player name") + visitor_batting_8_name = Column(String(1024), doc="Away team lineup slot 8 starting player name") visitor_batting_8_position = Column(SmallInteger, doc="Away team lineup slot 8 starting player fielding position") visitor_batting_9_player_id = Column(CHAR(8), doc="Away team lineup slot 9 starting player ID") - visitor_batting_9_name = Column(String(32), doc="Away team lineup slot 9 starting player name") + visitor_batting_9_name = Column(String(1024), doc="Away team lineup slot 9 starting player name") visitor_batting_9_position = Column(SmallInteger, doc="Away team lineup slot 9 starting player fielding position") home_batting_1_player_id = Column(CHAR(8), doc="Home team lineup slot 1 starting player ID") - home_batting_1_name = Column(String(32), doc="Home team lineup slot 1 starting player name") + home_batting_1_name = Column(String(1024), doc="Home team lineup slot 1 starting player name") home_batting_1_position = Column(SmallInteger, doc="Home team lineup slot 1 starting player fielding position") home_batting_2_player_id = Column(CHAR(8), doc="Home team lineup slot 2 starting player ID") - home_batting_2_name = Column(String(32), doc="Home team lineup slot 2 starting player name") + home_batting_2_name = Column(String(1024), doc="Home team lineup slot 2 starting player name") home_batting_2_position = Column(SmallInteger, doc="Home team lineup slot 2 starting player fielding position") home_batting_3_player_id = Column(CHAR(8), doc="Home team lineup slot 3 starting player ID") - home_batting_3_name = Column(String(32), doc="Home team lineup slot 3 starting 
player name") + home_batting_3_name = Column(String(1024), doc="Home team lineup slot 3 starting player name") home_batting_3_position = Column(SmallInteger, doc="Home team lineup slot 3 starting player fielding position") home_batting_4_player_id = Column(CHAR(8), doc="Home team lineup slot 4 starting player ID") - home_batting_4_name = Column(String(32), doc="Home team lineup slot 4 starting player name") + home_batting_4_name = Column(String(1024), doc="Home team lineup slot 4 starting player name") home_batting_4_position = Column(SmallInteger, doc="Home team lineup slot 4 starting player fielding position") home_batting_5_player_id = Column(CHAR(8), doc="Home team lineup slot 5 starting player ID") - home_batting_5_name = Column(String(32), doc="Home team lineup slot 5 starting player name") + home_batting_5_name = Column(String(1024), doc="Home team lineup slot 5 starting player name") home_batting_5_position = Column(SmallInteger, doc="Home team lineup slot 5 starting player fielding position") home_batting_6_player_id = Column(CHAR(8), doc="Home team lineup slot 6 starting player ID") - home_batting_6_name = Column(String(32), doc="Home team lineup slot 6 starting player name") + home_batting_6_name = Column(String(1024), doc="Home team lineup slot 6 starting player name") home_batting_6_position = Column(SmallInteger, doc="Home team lineup slot 6 starting player fielding position") home_batting_7_player_id = Column(CHAR(8), doc="Home team lineup slot 7 starting player ID") - home_batting_7_name = Column(String(32), doc="Home team lineup slot 7 starting player name") + home_batting_7_name = Column(String(1024), doc="Home team lineup slot 7 starting player name") home_batting_7_position = Column(SmallInteger, doc="Home team lineup slot 7 starting player fielding position") home_batting_8_player_id = Column(CHAR(8), doc="Home team lineup slot 8 starting player ID") - home_batting_8_name = Column(String(32), doc="Home team lineup slot 8 starting player name") 
+ home_batting_8_name = Column(String(1024), doc="Home team lineup slot 8 starting player name") home_batting_8_position = Column(SmallInteger, doc="Home team lineup slot 8 starting player fielding position") home_batting_9_player_id = Column(CHAR(8), doc="Home team lineup slot 9 starting player ID") - home_batting_9_name = Column(String(32), doc="Home team lineup slot 9 starting player name") + home_batting_9_name = Column(String(1024), doc="Home team lineup slot 9 starting player name") home_batting_9_position = Column(SmallInteger, doc="Home team lineup slot 9 starting player fielding position") - additional_info = Column(String(128), doc=""" + additional_info = Column(String(1024), doc=""" Additional information. This is a grab-bag of informational items that might not warrant a field on their own. The field is alpha-numeric. Some items are represented by tokens such as: @@ -842,7 +886,7 @@ class Event(Base): outs_ct = Column(SmallInteger, doc="Outs (0-2)") balls_ct = Column(SmallInteger, doc="Balls (0-3)") strikes_ct = Column(SmallInteger, doc="Strikes (0-2") - pitch_seq_tx = Column(String(30), doc="Pitch sequence") + pitch_seq_tx = Column(String(1024), doc="Pitch sequence") away_score_ct = Column(SmallInteger, doc="Away score") home_score_ct = Column(SmallInteger, doc="Home score") bat_id = Column(CHAR(8), doc="Batter ID") @@ -864,7 +908,7 @@ class Event(Base): base1_run_id = Column(CHAR(8), doc="ID of runner on first") base2_run_id = Column(CHAR(8), doc="ID of runner on second") base3_run_id = Column(CHAR(8), doc="ID of runner on third") - event_tx = Column(String(128), doc="Event text (in scoring shorthand") + event_tx = Column(String(1024), doc="Event text (in scoring shorthand") leadoff_fl = Column(Boolean, doc="Batter is leading off the inning") ph_fl = Column(Boolean, doc="Batter is pinch-hitting") bat_fld_cd = Column(SmallInteger, doc="Defensive position of batter (10 for DH, 11 for PH, 12 for PR") @@ -885,7 +929,7 @@ class Event(Base): battedball_cd = 
Column(CHAR(1), doc="Batted ball code (P - pop-up, G - ground ball, F - fly ball, L - line drive") bunt_fl = Column(Boolean, doc="Event is a bunt") foul_fl = Column(Boolean, doc="Event is a foul ball") - battedball_loc_tx = Column(String(5), doc="Hit location code (see https://www.retrosheet.org/location.htm)") + battedball_loc_tx = Column(String(1024), doc="Hit location code (see https://www.retrosheet.org/location.htm)") err_ct = Column(SmallInteger, doc="Number of errors recorded during event") err1_fld_cd = Column(SmallInteger, doc="Position code of fielder committing first error during event") err1_cd = Column(CHAR(1), doc="First error type (T - throwing, F - fielding)") @@ -905,10 +949,10 @@ class Event(Base): run3_dest_id = Column(SmallInteger, doc="Destination of runner on third after event (0 - putout, 1-3 - bases, " "4 - scored as earned run, 5 - scored as unearned, 6 - scored as unearned " "to team earned to pitcher)") - bat_play_tx = Column(String(15), doc="Fielding play on batter") - run1_play_tx = Column(String(15), doc="Fielding play on runner on first") - run2_play_tx = Column(String(15), doc="Fielding play on runner on second") - run3_play_tx = Column(String(15), doc="Fielding play on runner on third") + bat_play_tx = Column(String(1024), doc="Fielding play on batter") + run1_play_tx = Column(String(1024), doc="Fielding play on runner on first") + run2_play_tx = Column(String(1024), doc="Fielding play on runner on second") + run3_play_tx = Column(String(1024), doc="Fielding play on runner on third") run1_sb_fl = Column(Boolean, doc="Runner on first steals base") run2_sb_fl = Column(Boolean, doc="Runner on second steals base") run3_sb_fl = Column(Boolean, doc="Runner on third steals base") diff --git a/transform/src/ddl_maker.py b/transform/src/ddl_maker.py index 31365c8..384499b 100644 --- a/transform/src/ddl_maker.py +++ b/transform/src/ddl_maker.py @@ -1,6 +1,6 @@ from src import OUTPUT_PATH from src.ddl_factories import all_factories -from 
src.schemas import all_metadata +from src.boxball_schemas import all_metadata if __name__ == "__main__": diff --git a/transform/src/parquet.py b/transform/src/parquet.py index 5f3ea1a..b2b8319 100644 --- a/transform/src/parquet.py +++ b/transform/src/parquet.py @@ -8,7 +8,7 @@ from sqlalchemy import Integer, SmallInteger, Float, String, CHAR, Text, Boolean, Date, DateTime from sqlalchemy.sql.type_api import TypeEngine -from src.schemas import all_metadata +from src.boxball_schemas import all_metadata from src import EXTRACT_PATH_PREFIX, TRANSFORM_PATH_PREFIX PARQUET_PREFIX = TRANSFORM_PATH_PREFIX.joinpath("parquet") @@ -31,6 +31,10 @@ DateTime: 'timestamp[ms]' } +def invalid_row_handler(row) -> str: + print("Error: :", row) + return "skip" + def get_fields(table: AlchemyTable) -> List[Tuple[str, str]]: cols = [(c.name, c.type) for c in table.columns.values() if c.autoincrement is not True] @@ -58,7 +62,7 @@ def get_path(prefix: Path, suffix: str): column_names = [name for name, dtype in get_fields(table)] read_options = pcsv.ReadOptions(column_names=column_names, block_size=BUFFER_SIZE_BYTES) - parse_options = pcsv.ParseOptions(newlines_in_values=True) + parse_options = pcsv.ParseOptions(newlines_in_values=True, invalid_row_handler=invalid_row_handler) convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"], true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True) diff --git a/transform/src/schemas/__init__.py b/transform/src/schemas/__init__.py deleted file mode 100644 index 789dd6a..0000000 --- a/transform/src/schemas/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from typing import List -from sqlalchemy import MetaData -from src.schemas.retrosheet import metadata as retrosheet_metadata -from src.schemas.baseballdatabank import metadata as baseballdatabank_metadata - -all_metadata: List[MetaData] = [baseballdatabank_metadata, retrosheet_metadata] diff --git a/transform/src/setup.py 
b/transform/src/setup.py new file mode 100644 index 0000000..0b84022 --- /dev/null +++ b/transform/src/setup.py @@ -0,0 +1,12 @@ +import setuptools + +setuptools.setup( + name="boxball-schemas", + version="0.0.8", + author="David Roher", + description="Schemas for the Boxball project that can be used for extension or code completion", + url="https://github.com/droher/boxball", + packages=["boxball_schemas"], + python_requires='>=3.6', + install_requires=["sqlalchemy>=1.3.3"] +)