Skip to content

Commit

Permalink
Merge branch 'composite-feature-multi' into 'master'
Browse files Browse the repository at this point in the history
Composite feature multi base

See merge request lbsn/lbsntransform!13
  • Loading branch information
Sieboldianus committed Nov 23, 2022
2 parents 1bc1974 + 60da223 commit a612b4e
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/mappings.md
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ Bases can be separated by comma and may include:
- `_emoji_latlng`
- `_month_latlng`
- `_month_hashtag`
- `_month_hashtag_latlng`


For example:
Expand Down
1 change: 1 addition & 0 deletions lbsntransform/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,7 @@ def get_arg_parser(
'* _term_latlng '
'* _emoji_latlng '
'* _month_hashtag '
'* _month_hashtag_latlng '
'* _month_latlng '
'* monthofyear '
'* month '
Expand Down
53 changes: 52 additions & 1 deletion lbsntransform/output/hll/base/temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

FACET = 'temporal'


class TimestampBase(hll.HllBase):
"""Extends Base Class"""
NAME = hll.HllBaseRef(facet=FACET, base='timestamp')
Expand Down Expand Up @@ -183,6 +184,7 @@ def __init__(self, record: lbsn.Post = None):
if post_date_time:
self.key['monthofyear'] = post_date_time.month


class MonthHashtagBase(hll.HllBase):
"""Composite Base (c-base) that extends from HLL base Class
Expand Down Expand Up @@ -215,6 +217,7 @@ def __init__(self, record: lbsn.Post = None, hashtag: str = None):
"Parsing of MonthHashtagBase only supported "
"from lbsn.Post")


class MonthLatLngBase(hll.HllBase):
"""Composite Base (c-base) that extends from HLL base Class
Expand Down Expand Up @@ -255,4 +258,52 @@ def __init__(self, record: lbsn.Post = None):
else:
raise ValueError(
"Parsing of MonthLatLngBase only supported "
"from lbsn.Post")
"from lbsn.Post")


class MonthHashtagLatLngBase(hll.HllBase):
    """Composite base (c-base) keyed by year, month, hashtag and lat/lng.

    Extends hll.HllBase. C-bases combine aspects from multiple facets;
    to tell them apart from plain bases, their names carry a leading
    underscore.
    """
    NAME = hll.HllBaseRef(facet=FACET, base='_month_hashtag_latlng')

    def __init__(self, record: lbsn.Post = None, hashtag: str = None):
        """Build the composite key from a post and a single hashtag.

        Args:
            record: The lbsn.Post to read timestamp and coordinates from.
                If None, only the hashtag part of the key is filled.
            hashtag: The hashtag for this base record; stored lowercased.
                If None, the instance is left completely empty.

        Raises:
            ValueError: If both arguments are given and ``record`` is
                not an lbsn.Post.
        """
        super().__init__()
        # all key parts start out as None and are filled step by step
        key_parts = ('year', 'month', 'hashtag', 'latitude', 'longitude')
        for key_part in key_parts:
            self.key[key_part] = None
        self.attrs['latlng_geom'] = None

        if hashtag is None:
            # nothing to parse: keep the empty instance
            return
        self.key['hashtag'] = hashtag.lower()

        if record is None:
            # no post given: only the hashtag is known
            return
        if not isinstance(record, lbsn.Post):
            raise ValueError(
                "Parsing of MonthHashtagLatLngBase only supported "
                "from lbsn.Post")

        # temporal part of the key; stays None without a usable timestamp
        post_dt = HLF.merge_dates_post(record)
        if post_dt:
            post_date = post_dt.date()
            self.key['year'] = post_date.year
            self.key['month'] = post_date.month

        # NOTE(review): nesting was reconstructed from a flattened diff;
        # the spatial part is assumed to be independent of the timestamp
        # check above - confirm against the upstream file.
        latlng_ewkt = record.post_latlng
        latlng = HF.get_coordinates_from_ewkt(latlng_ewkt)
        self.key['latitude'] = latlng.lat
        self.key['longitude'] = latlng.lng
        # optional extra attribute, formatted ready for the sql upsert
        self.attrs['latlng_geom'] = HF.return_ewkb_from_geotext(latlng_ewkt)
8 changes: 8 additions & 0 deletions lbsntransform/output/hll/hll_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def merge_base_metrics(base1, base2):
continue
base1.metrics[key] |= new_set


def append_baserecord(
base_records: List['HllBase'], base_record: 'HllBase'):
"""Append base_record to list, if all keys have valid values (not None)"""
Expand All @@ -124,6 +125,7 @@ def append_baserecord(
return
base_records.append(base_record)


def base_factory(facet=None, base=None, record: lbsn.Post = None):
"""Base is initialized based on facet-base tuple
and constructed by parsing lbsn records
Expand Down Expand Up @@ -194,6 +196,12 @@ def base_factory(facet=None, base=None, record: lbsn.Post = None):
for tag in tag_terms:
base_record = base_structure(record=record, hashtag=tag)
append_baserecord(records, base_record)
elif base == '_month_hashtag_latlng':
# any hashtag explicitly used
tag_terms = HF.filter_terms(record.hashtags)
for tag in tag_terms:
base_record = base_structure(record=record, hashtag=tag)
append_baserecord(records, base_record)
else:
# init for all other bases with single lbsn record
base_record = base_structure(record)
Expand Down
6 changes: 4 additions & 2 deletions lbsntransform/output/hll/shared_structure_proto_hlldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
spatial.CityBase, spatial.CountryBase,
temporal.DateBase, temporal.MonthBase,
temporal.YearBase, temporal.MonthLatLngBase,
temporal.MonthHashtagBase, topical.TermBase,
temporal.MonthHashtagBase,
temporal.MonthHashtagLatLngBase, topical.TermBase,
topical.EmojiBase, topical.TermLatLngBase,
topical.HashtagLatLngBase, topical.EmojiLatLngBase,
social.CommunityBase]
Expand Down Expand Up @@ -136,7 +137,8 @@ def extract_hll_bases(
base_list.extend(base_records)
# Temporal Facet
temporal_bases = [
'date', 'month', 'year', '_month_latlng', '_month_hashtag']
'date', 'month', 'year', '_month_latlng', '_month_hashtag',
'_month_hashtag_latlng']
temporal_bases = self.filter_bases(
temporal_bases, include_lbsn_bases)
base_records = self.make_bases(
Expand Down
1 change: 1 addition & 0 deletions lbsntransform/output/submit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def __init__(self, db_cursor=None,
temporal.YearBase.NAME: dict(),
temporal.MonthLatLngBase.NAME: dict(),
temporal.MonthHashtagBase.NAME: dict(),
temporal.MonthHashtagLatLngBase.NAME: dict(),
topical.TermBase.NAME: dict(),
topical.HashtagBase.NAME: dict(),
topical.EmojiBase.NAME: dict(),
Expand Down
21 changes: 13 additions & 8 deletions lbsntransform/tools/helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,30 @@
from google.protobuf.internal.containers import \
ScalarMap # pylint: disable=no-name-in-module

NLTK_AVAIL = True
NLTK_AVAIL = None
STOPWORDS = None
try:
# check if nltk is installed
import nltk
NLTK_AVAIL = True
except ImportError:
NLTK_AVAIL = False
pass

if NLTK_AVAIL:
try:
# check if stopwords corpus is available
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
except LookupError:
print(
'Please use '
'`python -c \'import nltk;nltk.download("stopwords")\'` '
'to install stopwords resource globally. Continuing without '
'nltk stopwords filter..')
NLTK_AVAIL = False
STOPWORDS = None
# pylint: disable=no-member


class HelperFunctions():
"""Collection of helper functions being used in lbsntransform package"""

Expand Down Expand Up @@ -174,10 +179,10 @@ def filter_terms(
return resultwords

@staticmethod
def nltk_stopword_filter(term: str) -> bool:
def nltk_stopword_filter(term: str, nltk_avail=NLTK_AVAIL, stopwords=STOPWORDS) -> bool:
"""Filter term against nltk stopwords (english)"""
if NLTK_AVAIL:
if term in stopwords.words('english'):
if nltk_avail is not None and stopwords is not None:
if term in stopwords:
return False
return True

Expand Down Expand Up @@ -224,7 +229,7 @@ def get_coordinates_from_ewkt(

@staticmethod
def extract_hashtags_from_string(
text_str: str) -> Set[str]:
text_str: str) -> Set[str]:
"""Extract hashtags with leading hash-character (#) from string
- removes # from hashtags
Expand All @@ -239,7 +244,7 @@ def extract_hashtags_from_string(

@staticmethod
def extract_atmentions_from_string(
text_str: str) -> Set[str]:
text_str: str) -> Set[str]:
"""Extract @-mentions with leading at-character (@) from string
- removes @ from mentions
Expand Down

0 comments on commit a612b4e

Please sign in to comment.