Skip to content

Commit

Permalink
Allow regexclass ranges (#4)
Browse files Browse the repository at this point in the history
The regexmodel now is able to parse and estimate regex class ranges such as [A-F], [g-s], [4-7]. This should also result in small improvements in performance.
  • Loading branch information
qubixes authored Sep 25, 2023
1 parent d15e314 commit 17530fc
Show file tree
Hide file tree
Showing 22 changed files with 556 additions and 320 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install ".[test,tutorial]"
- name: Check pep8 with flake8
- name: Lint with ruff
run: |
flake8 regexmodel --max-line-length 100
ruff regexmodel
- name: Lint with pylint
run: |
pylint regexmodel
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
regexmodel/_version.py

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -161,4 +162,4 @@ cython_debug/

# pyenv

.python-version
.python-version
34 changes: 17 additions & 17 deletions benchmarks/accurate/avg_log_like_pc_success.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
faker_type,nl,fr,en,de,da
average,-2.1915454889405193,-2.085692682375947,-2.18280418271571,-2.1232970908249675,-2.096331377349144
address,-2.8283862849587535,-1.4533890029216738,-2.0336731390272353,-1.925900279810606,-2.759496162520807
phone_number,-1.7688013889196164,-1.5607712942902165,-2.1163970594677974,-1.749678224572948,-2.001020843204409
average,-2.608543390685392,-2.5540763210301898,-2.5856597956220946,-2.5692086137569947,-2.422232885238715
address,-3.18723924481244,-3.887090673243783,-2.785144772943876,-3.566546641315572,-3.131905640139533
phone_number,-2.420385889036475,-1.5785009699055097,-2.2114432381512685,-2.1368741178333464,-2.049223740657882
pricetag,-1.7923231995145659,-1.5760340411557583,-1.7923231995145659,-1.5760340411557583,-1.2696132934896613
timezone,-1.897851672904426,-1.897851672904426,-1.897851672904426,-1.897851672904426,-1.897851672904426
mime_type,-2.494969163134166,-2.494969163134166,-2.494969163134166,-2.494969163134166,-2.494969163134166
unix_partition,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995
ascii_email,-2.8487401923062,-2.7446657828270458,-2.806944919984864,-2.8093778066649575,-2.7801893522322323
isbn10,-1.885582753251952,-1.885582753251952,-1.885582753251952,-1.885582753251952,-1.885582753251952
job,-3.2825423498046944,-3.3197176489230835,-3.2825423498046944,-3.512404371144477,-3.60416826076485
timezone,-2.968113041245333,-2.968113041245333,-2.968113041245333,-2.968113041245333,-2.968113041245333
mime_type,-3.214729512361329,-3.214729512361329,-3.214729512361329,-3.214729512361329,-3.214729512361329
unix_partition,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647
ascii_email,-2.932696888699675,-3.086622584650391,-2.848214128255551,-3.1044451170140266,-2.8450015512821145
isbn10,-1.9090496820227958,-1.9090496820227958,-1.9090496820227958,-1.9090496820227958,-1.9090496820227958
job,-3.652449473053463,-3.6658597233362515,-3.652449473053463,-3.5153069329301756,-3.576063228123199
ssn,-2.302585092994053,-2.3173140261754446,-1.883933257904226,-1.883933257904226,-1.883933257904226
user_agent,-0.2602482506708753,-0.2602482506708753,-0.2602482506708753,-0.2602482506708753,-0.2602482506708753
color,-3.07863311203932,-3.07863311203932,-3.07863311203932,-3.07863311203932,-3.07863311203932
license_plate,-2.470697252907526,-2.2421505394955856,-3.256390150685296,-2.66214645531138,-2.0032353934459195
user_agent,-3.0023386522052933,-3.0023386522052933,-3.0023386522052933,-3.0023386522052933,-3.0023386522052933
color,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406
license_plate,-2.795026079321331,-2.3157898133188928,-3.918688389538393,-2.6225242552034,-2.0032353934459195
iban,-2.2590781815563674,-2.1320232342537286,-2.266988529090495,-2.0932591754491296,-2.072326583694648
company,-2.955044174105518,-3.0906967602544735,-2.7090592212311626,-3.070200644452302,-2.135437650751123
time,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485
ipv4,-2.078159173483116,-2.078159173483116,-2.078159173483116,-2.078159173483116,-2.078159173483116
uri,-1.6733102367998156,-1.745111628166589,-1.74863506343372,-1.723981947398809,-1.7329466523265975
name,-3.1991469206356564,-3.187577991312297,-3.3176835660873323,-3.077019506442681,-3.329219703932165
company,-3.7194810482582334,-3.126868525309265,-3.111995068243158,-3.5006759708568267,-2.5272771610635956
time,-1.736774924512717,-1.736774924512717,-1.736774924512717,-1.736774924512717,-1.736774924512717
ipv4,-2.078583243995832,-2.078583243995832,-2.078583243995832,-2.078583243995832,-2.078583243995832
uri,-2.89706171157613,-2.898724633175328,-2.925111627052448,-2.912835039637672,-2.917546034331703
name,-3.1906492941234186,-3.5292735549729537,-3.3178961129960527,-3.489180792006465,-3.3329506153268027
34 changes: 17 additions & 17 deletions benchmarks/accurate/avg_log_like_per_char.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
faker_type,nl,fr,en,de,da
average,-3.2938598343934418,-3.3555745632366025,-3.316232988868926,-3.270966353500831,-3.1062455436565832
address,-3.834188626954746,-6.302684528850788,-4.176094603259543,-5.305111907631831,-4.555830173227214
phone_number,-2.2295751913746455,-1.6323777437240714,-3.044748889264162,-2.416276680580892,-2.2073134372665324
average,-3.235088778489685,-3.2739489408299,-3.235365149011781,-3.1794819021421303,-3.0369287434358316
address,-3.840148579218316,-6.082348486264795,-3.703915298159491,-5.079471629783415,-4.642334566045615
phone_number,-2.519394413249677,-1.6605009305061085,-3.06802094516657,-2.4578177002378103,-2.250806260046024
pricetag,-1.857589571606867,-1.644179768981727,-1.857589571606867,-1.644179768981727,-1.3418765920395992
timezone,-3.557125384790429,-3.557125384790429,-3.557125384790429,-3.557125384790429,-3.557125384790429
mime_type,-4.309785403364651,-4.309785403364651,-4.309785403364651,-4.309785403364651,-4.309785403364651
unix_partition,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995,-0.8128951812359995
ascii_email,-3.955438091483059,-3.6149673949775107,-3.6707139359652112,-3.5748414604300818,-3.4054001931982816
isbn10,-2.406411205324038,-2.406411205324038,-2.406411205324038,-2.406411205324038,-2.406411205324038
job,-4.884924919736532,-5.705984913662813,-4.884924919736532,-3.6686003606586874,-4.595367802147032
timezone,-3.5919927903318856,-3.5919927903318856,-3.5919927903318856,-3.5919927903318856,-3.5919927903318856
mime_type,-3.92647728939957,-3.92647728939957,-3.92647728939957,-3.92647728939957,-3.92647728939957
unix_partition,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647,-1.0229921882996647
ascii_email,-3.9691136015503687,-3.582655346564504,-3.6927285228324136,-3.547302937885057,-3.450809295007938
isbn10,-2.1910620580931046,-2.1910620580931046,-2.1910620580931046,-2.1910620580931046,-2.1910620580931046
job,-4.838872161824781,-5.701827690566834,-4.838872161824781,-3.6669438767303433,-4.563536384448202
ssn,-2.302585092994053,-2.3173140261754446,-1.883933257904226,-1.883933257904226,-1.883933257904226
user_agent,-5.6441974842594185,-5.6441974842594185,-5.6441974842594185,-5.6441974842594185,-5.6441974842594185
color,-3.07863311203932,-3.07863311203932,-3.07863311203932,-3.07863311203932,-3.07863311203932
license_plate,-2.6478583661828106,-2.2421505394955856,-4.165539752656556,-2.884306282078523,-2.0032353934459195
user_agent,-5.240401175100682,-5.240401175100682,-5.240401175100682,-5.240401175100682,-5.240401175100682
color,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406,-2.4807670754333406
license_plate,-2.9795090041929244,-2.3157898133188928,-4.440001467355454,-2.838396764558982,-2.0032353934459195
iban,-2.2590781815563674,-2.1320232342537286,-2.266988529090495,-2.0932591754491296,-2.072326583694648
company,-4.934710446847975,-4.300955126886775,-3.8023309055353445,-4.852974952335046,-3.340791390626175
time,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485,-1.7503697086472485
ipv4,-2.26972756360758,-2.26972756360758,-2.26972756360758,-2.26972756360758,-2.26972756360758
uri,-5.067197884807189,-5.04363112749814,-5.582602500792118,-5.04202573068279,-5.21323789991221
name,-4.781045436662459,-4.9905032537201865,-3.8438148794298477,-4.953706096514169,-4.5702075627445655
company,-4.8499532026595595,-4.211619416906417,-3.805334120230944,-4.725582855110774,-3.288857834461055
time,-1.872908543697085,-1.872908543697085,-1.872908543697085,-1.872908543697085,-1.872908543697085
ipv4,-2.2634526769570096,-2.2634526769570096,-2.2634526769570096,-2.2634526769570096,-2.2634526769570096
uri,-4.857727899651415,-5.031836748733662,-5.489040615715497,-4.849665855189024,-5.059584335510611
name,-4.602661285487342,-4.9348806161836505,-3.835459544024767,-5.03354852155765,-4.554291825364615
2 changes: 1 addition & 1 deletion benchmarks/accurate/benchmark.json

Large diffs are not rendered by default.

40 changes: 20 additions & 20 deletions benchmarks/accurate/fit_time.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
faker_type,nl,fr,en,de,da
average,1.0345648144420825,1.2123727422011528,1.0417345015626207,1.045587078521126,0.9813138428487278
address,0.5851861000061035,3.190283405780792,1.179566252231598,1.620183265209198,1.0900044083595275
phone_number,0.27152488231658933,0.31640344858169556,0.3707251787185669,0.4781836748123169,0.12308040857315064
pricetag,0.09918105602264404,0.10968753099441528,0.09890204668045044,0.10601838827133178,0.12361856698989868
timezone,0.7945192575454711,0.7907622814178467,0.7977358102798462,0.7928660154342652,0.794252622127533
mime_type,0.6416023373603821,0.626270866394043,0.6598812699317932,0.6560978055000305,0.6223598003387452
unix_partition,0.1689455986022949,0.16861530542373657,0.16406331062316895,0.16328907012939453,0.1622081756591797
ascii_email,0.3578091263771057,0.2798264503479004,0.2519848823547363,0.2195643186569214,0.15801957845687867
isbn10,0.14612566232681273,0.1527639627456665,0.1379132866859436,0.13787614107131957,0.14406723976135255
job,0.727282977104187,1.6923543453216552,0.7106510639190674,0.06669650077819825,0.5871858239173889
ssn,0.012790465354919433,0.06312897205352783,0.04731178283691406,0.0453626275062561,0.043325531482696536
user_agent,12.596259105205537,12.546485209465027,12.668293738365174,12.390237605571746,12.43778133392334
color,0.149095618724823,0.14120876789093018,0.14178090095520018,0.147017240524292,0.1418485164642334
license_plate,0.1619734764099121,0.11599844694137573,0.28234190940856935,0.10648893117904663,0.048722708225250246
iban,0.05423048734664917,0.03062647581100464,0.056125521659851074,0.02893441915512085,0.02854464054107666
company,0.860711669921875,0.4471618175506592,0.2821364879608154,0.8930826306343078,0.39163488149642944
time,0.07157649993896484,0.07681479454040527,0.07187584638595582,0.07299824953079223,0.07291327714920044
ipv4,0.058012306690216064,0.0574945330619812,0.05647341012954712,0.05602554082870483,0.05796483755111694
uri,1.1566624999046327,1.3994098067283631,1.661333680152893,1.0549142003059386,1.246828019618988
name,0.743242347240448,0.829785680770874,0.1538591504096985,0.8303178668022155,0.3706026434898376
average,1.0539685337167037,1.1724900741326183,1.0449505680485773,1.0116366750315617,0.9894646123835915
address,0.3905096411705017,2.4048934102058412,0.88459233045578,1.37061265707016,1.1492964386940003
phone_number,0.2472851037979126,0.31745070219039917,0.31231064796447755,0.45900691747665406,0.11690607070922851
pricetag,0.09145452976226806,0.09916337728500366,0.09132636785507202,0.09569346904754639,0.10785114765167236
timezone,0.36651819944381714,0.3723091959953308,0.36817280054092405,0.36243308782577516,0.3656641602516174
mime_type,0.3960761785507202,0.39853849411010744,0.39532947540283203,0.39961127042770384,0.39510915279388426
unix_partition,0.15677225589752197,0.16830892562866212,0.1568874955177307,0.1540762662887573,0.157298743724823
ascii_email,0.3711480975151062,0.4466110587120056,0.3135540246963501,0.38162081241607665,0.2586862564086914
isbn10,0.07403323650360108,0.072108793258667,0.07242581844329835,0.07177988290786744,0.07382155656814575
job,0.5449542999267578,1.6180486798286438,0.5358052730560303,0.058674752712249756,0.5220331430435181
ssn,0.011691272258758545,0.020928192138671874,0.04452129602432251,0.04437443017959595,0.04361436367034912
user_agent,13.337422728538513,13.322584187984466,13.204304611682891,12.808983647823334,13.13469854593277
color,0.08022589683532715,0.07984303236007691,0.07631831169128418,0.07980431318283081,0.07828595638275146
license_plate,0.8959861516952514,0.2581357717514038,1.2845681667327882,0.086572265625,0.046995580196380615
iban,0.050638020038604736,0.028040647506713867,0.053077518939971924,0.027474796772003172,0.026432621479034423
company,0.8615628361701966,0.49370708465576174,0.3043045163154602,0.9282056212425231,0.43524144887924193
time,0.16923638582229614,0.17327781915664672,0.1716768503189087,0.1747194766998291,0.16687155961990358
ipv4,0.06315799951553344,0.062369561195373534,0.061660492420196535,0.0629740834236145,0.06114859580993652
uri,1.0409641027450562,1.164347732067108,1.3814642071723937,0.7627432703971863,1.0739984512329102
name,0.8757652044296265,0.7766447424888611,0.14176058769226074,0.8917358040809631,0.5858738422393799
36 changes: 18 additions & 18 deletions benchmarks/accurate/n_parameters.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
faker_type,nl,fr,en,de,da
average,69.73684210526316,76.78157894736842,73.71052631578948,73.21315789473684,67.96578947368421
address,23.0,106.2,89.1,51.65,50.25
phone_number,38.5,56.55,46.35,51.05,20.35
average,84.06315789473683,90.3921052631579,85.9578947368421,86.10526315789475,80.76052631578946
address,28.15,104.1,84.7,66.6,57.25
phone_number,35.8,51.65,46.2,52.85,20.6
pricetag,10.0,11.0,10.0,11.0,13.0
timezone,67.55,67.55,67.55,67.55,67.55
mime_type,63.95,63.95,63.95,63.95,63.95
unix_partition,23.0,23.0,23.0,23.0,23.0
ascii_email,26.8,29.8,20.7,22.4,20.85
isbn10,23.15,23.15,23.15,23.15,23.15
job,30.95,55.5,30.95,4.95,26.15
ssn,2.0,4.0,6.0,6.0,6.0
user_agent,817.05,817.05,817.05,817.05,817.05
color,4.3,4.3,4.3,4.3,4.3
license_plate,21.85,14.0,35.9,9.6,6.0
timezone,44.95,44.95,44.95,44.95,44.95
mime_type,48.85,48.85,48.85,48.85,48.85
unix_partition,15.0,15.0,15.0,15.0,15.0
ascii_email,30.45,34.85,23.7,28.25,21.55
isbn10,11.7,11.7,11.7,11.7,11.7
job,36.1,77.2,36.1,5.45,30.95
ssn,2.0,3.0,6.0,6.0,6.0
user_agent,1058.6,1058.6,1058.6,1058.6,1058.6
color,4.0,4.0,4.0,4.0,4.0
license_plate,45.55,18.15,55.3,11.2,6.0
iban,6.0,4.0,6.0,4.0,4.0
company,53.0,48.2,44.65,89.5,39.05
time,8.2,8.2,8.2,8.2,8.2
ipv4,8.0,8.0,8.0,8.0,8.0
uri,63.0,63.35,87.65,77.9,59.75
name,34.7,51.05,8.0,47.8,30.75
company,60.75,57.4,42.1,89.5,47.4
time,27.75,27.75,27.75,27.75,27.75
ipv4,8.2,8.2,8.2,8.2,8.2
uri,75.6,77.45,94.6,83.2,76.1
name,47.75,59.6,9.45,58.9,32.55
40 changes: 20 additions & 20 deletions benchmarks/accurate/statistics_time.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
faker_type,nl,fr,en,de,da
average,0.23670234868400977,0.19632002240733099,0.20751905378542448,0.1909523047898945,0.2666338274353429
address,0.5640411972999573,0.08103983402252198,0.35432752370834353,0.5469830274581909,1.5645005583763123
phone_number,0.18356084823608398,0.27067837715148924,0.22043415307998657,0.3016166687011719,0.22791289091110228
pricetag,0.11802563667297364,0.10484408140182495,0.11929800510406494,0.12383359670639038,0.11443854570388794
timezone,0.20284318923950195,0.2001940965652466,0.19879117012023925,0.19469419717788697,0.20013341903686524
mime_type,0.2630094051361084,0.2680829882621765,0.25700788497924804,0.2564225435256958,0.28691624402999877
unix_partition,0.12089968919754028,0.12026575803756714,0.11958647966384887,0.09764676094055176,0.09850943088531494
ascii_email,0.4777830719947815,0.45578391551971437,0.3580363392829895,0.4442672967910767,0.35654832124710084
isbn10,0.1591336965560913,0.15549232959747314,0.13595470190048217,0.13643981218338014,0.14882111549377441
job,0.895618736743927,0.5545249700546264,0.8716941118240357,0.1670621395111084,0.933614456653595
ssn,0.03439621925354004,0.09856067895889283,0.06396838426589965,0.0661852478981018,0.07760518789291382
user_agent,0.0742943525314331,0.0760420322418213,0.07422363758087158,0.07444602251052856,0.07493953704833985
color,0.036913824081420896,0.043144619464874266,0.03863755464553833,0.03617870807647705,0.03782203197479248
license_plate,0.12433426380157471,0.083889901638031,0.2951118588447571,0.14747953414916992,0.06802377700805665
iban,0.06942131519317626,0.046972906589508055,0.07641474008560181,0.04523591995239258,0.041746675968170166
company,0.3531560659408569,0.4767865777015686,0.16467362642288208,0.31743320226669314,0.22107183933258057
time,0.08106378316879273,0.06422587633132934,0.07434871196746826,0.06832236051559448,0.06625701189041137
ipv4,0.17699612379074098,0.1620609164237976,0.1704077959060669,0.19188920259475709,0.16540493965148925
uri,0.21172951459884642,0.1848188042640686,0.16189446449279785,0.2125089168548584,0.20953937768936157
name,0.3501236915588379,0.28267176151275636,0.18805087804794313,0.19944863319396972,0.1722373604774475
average,0.3725274939286081,0.35760665190847296,0.3541105922899749,0.3333816998883297,0.3658875609699049
address,0.3507387042045593,0.5237283706665039,0.306281316280365,0.5258392691612244,0.7479472994804383
phone_number,0.13483427762985228,0.17085692882537842,0.14457722902297973,0.2041498303413391,0.15075273513793946
pricetag,0.0670514702796936,0.07024327516555787,0.06944836378097534,0.06762607097625732,0.07765616178512573
timezone,0.17192832231521607,0.16927993297576904,0.17377980947494506,0.1745126485824585,0.18833197355270387
mime_type,0.19353278875350952,0.19230760335922242,0.1960466980934143,0.18899186849594116,0.19149105548858641
unix_partition,0.07950320243835449,0.08038326501846313,0.07492446899414062,0.07483541965484619,0.07768925428390502
ascii_email,0.3820098161697388,0.2919479846954346,0.293787682056427,0.2775312542915344,0.3095932364463806
isbn10,0.08862709999084473,0.09146804809570312,0.08646180629730224,0.09172695875167847,0.09234809875488281
job,0.8401842474937439,0.5237451791763306,0.7228711843490601,0.09164756536483765,0.4624874234199524
ssn,0.02109973430633545,0.05699102878570557,0.040679645538330075,0.04134832620620728,0.04376267194747925
user_agent,3.5394612431526182,3.5141764760017393,3.5784319758415224,3.4911162495613097,3.574007034301758
color,0.022504639625549317,0.023080503940582274,0.02483234405517578,0.02321392297744751,0.024385404586791993
license_plate,0.11899588108062745,0.06640117168426514,0.16523100137710572,0.09924142360687256,0.044054734706878665
iban,0.04013749361038208,0.02795299291610718,0.03892586231231689,0.02615940570831299,0.026592910289764404
company,0.2860146164894104,0.24908096790313722,0.17642858028411865,0.28308495283126833,0.1593705415725708
time,0.12270145416259766,0.12008458375930786,0.11647778749465942,0.11584010124206542,0.11876322031021118
ipv4,0.09932031631469726,0.10533849000930787,0.10943217277526855,0.09472846984863281,0.11898685693740844
uri,0.3120978355407715,0.29120917320251466,0.28321017026901246,0.2415168046951294,0.33221733570098877
name,0.2072792410850525,0.22625041007995605,0.12627315521240234,0.2211417555809021,0.21142570972442626
Loading

0 comments on commit 17530fc

Please sign in to comment.