|
| 1 | +#!/usr/bin/env perl |
| 2 | + |
| 3 | +use strict; |
| 4 | +use Geo::Hash; |
| 5 | + |
# Upper bound on the number of named nodes to emit before stopping.
my $num_to_find = 100_000_000;

# Output column delimiter. "<" is illegal in the XML, so safe to use as delimiter.
my $col_sep = '<';

# Output columns, in print order.
my @col_names = ('id', 'timestamp', 'uid', 'lat', 'lon', 'name', 'key_value');

# Per-record buffer of column values, keyed by the names in @col_names.
my %cols;

# Read the input one "...</node>" record at a time rather than line by line.
$/ = "</node>\n";
| 12 | + |
# Reset the shared %cols buffer so that it holds exactly one key per entry of
# @col_names, each with an undefined value.
sub reset_cols {
    %cols = ();
    @cols{@col_names} = ();    # hash-slice assignment: creates every key as undef
    return;
}
| 16 | + |
# Main loop: turn OSM "<node ...> ... </node>" records into delimited rows.
#
# - Each read from <> returns one record ending in "</node>\n" (the input
#   record separator is set near the top of the file).
# - The opening "<node ...>" element supplies id, timestamp, uid, lat and lon.
# - The record is then split on "\n" and each '<tag k=".." v=".."/>' line is
#   examined: a key that names a tag-fillable column of %cols (in practice
#   "name") fills that column; any other tag is appended to the final column,
#   "key_value", as k1=v1|k2=v2|...
# - Geohash prefixes of length 3..6 are appended to "key_value", and the full
#   geohash is printed as one extra trailing column.
# - A row is printed only if the place has a name.

# Columns that must never be filled from a <tag>: the first five come from the
# <node> attributes (and lat/lon feed the geohash), and key_value is built
# below. A tag whose key collides with one of these goes into key_value.
my %is_reserved_col = map { $_ => 1 } qw( id timestamp uid lat lon key_value );

# Compiled once, outside the loop.
my $node_re = qr{<node +id="(\d+)" +version="\d+" +timestamp="([^"]+)" +uid="(\d+)" +user="[^"]+" +changeset="\d+" +lat="([^"]+)" +lon="([^"]+)">};
my $tag_re  = qr{<tag +k="([^"]+)" v="([^"]+)"\s*/>};

my $num_found = 0;
my $gh = Geo::Hash->new;
while (my $record = <>) {
    last if $num_found >= $num_to_find;

    # Start every record from a clean buffer so nothing leaks between rows.
    reset_cols();

    # Records without a matching opening <node ...> element produce no row.
    next unless $record =~ $node_re;
    @cols{qw( id timestamp uid lat lon )} = ($1, $2, $3, $4, $5);
    my $geohash = $gh->encode($cols{lat}, $cols{lon});

    my @key_value = ();
    for my $line (split /\n/, $record) {
        next unless $line =~ $tag_re;

        # Copy the captures right away: a successful s/// below would reset
        # $1 and $2, which previously lost the value of any tag whose key
        # contained "|" or "=".
        my ($k, $v) = ($1, $2);

        if (exists $cols{$k} && !$is_reserved_col{$k}) {
            $cols{$k} = $v;
            next;
        }

        # "|" and "=" are the key_value separators, so mask them in the data.
        s/[|=]/~/g for $k, $v;
        push @key_value, $k . "=" . $v;
    }

    # Append a few geohash substrings (prefix lengths 3 through 6).
    push @key_value, map { substr($geohash, 0, $_) } 3 .. 6;
    $cols{key_value} = join("|", @key_value);

    # Only named places are emitted; test definedness/length rather than
    # truth so that a place literally named "0" is not dropped.
    next unless defined $cols{name} && length $cols{name};

    $num_found += 1;
    print join($col_sep, @cols{@col_names}, $geohash) . "\n";
    print STDERR "N rows: $num_found\n" if $num_found % 1000 == 0;
}
| 65 | + |
| 66 | +__DATA__ |
| 67 | +<node id="271251" version="4" timestamp="2009-09-09T22:16:06Z" uid="169366" user="Tunafish" changeset="2430548" lat="50.8052" lon="-1.67253"> |
| 68 | + <tag k="name" v="Station House"/> |
| 69 | + <tag k="amenity" v="restaurant"/> |
| 70 | + <tag k="cuisine" v="tea;restaurant"/> |
| 71 | +</node> |
| 72 | +<node id="4082701" version="4" timestamp="2013-10-22T06:40:09Z" uid="453141" user="ppr9" changeset="18480867" lat="52.1602045" lon="-0.4921953"> |
| 73 | + <tag k="name" v="Bellini's"/> |
| 74 | + <tag k="amenity" v="restaurant"/> |
| 75 | + <tag k="wheelchair" v="yes"/> |
| 76 | + <tag k="addr:street" v="High Street"/> |
| 77 | + <tag k="addr:postcode" v="MK41 6EG"/> |
| 78 | + <tag k="addr:housenumber" v="44,46"/> |
| 79 | +</node> |
| 80 | +
|
0 commit comments