-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
83 lines (72 loc) · 2.38 KB
/
Copy pathparser.py
File metadata and controls
83 lines (72 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
from pathlib import Path
from typing import List, Dict, Union
from urllib.request import urlopen
from datetime import datetime
import html2text
from patterns import (
REPORT_DATE,
REQUESTED,
RETURNED,
BIG_NUMBER,
DATE,
REJECTED,
IN_PERSON,
TOTAL,
TURNOUT,
PERCENT,
)
from unmark import unmark
# Absolute directory containing this script; the default input ("tmp.html")
# and the output ("tmp.jsonl") are both located relative to it.
here = Path(__file__).parent.resolve()
def convert_html_to_plain_text(filepath: Union[str, Path]) -> List[str]:
    """Render a local HTML file as a list of normalised plain-text lines.

    Args:
        filepath: Location of the HTML file, given as a ``str`` or a
            ``Path``. Relative paths are resolved against the current
            working directory.

    Returns:
        One entry per line of the ``html2text`` rendering, each stripped of
        surrounding whitespace, lower-cased and passed through ``unmark``.
    """
    # ``as_uri`` only exists on Path objects and rejects relative paths, so
    # coerce and resolve first; this makes the advertised ``str`` input work.
    uri = Path(filepath).resolve().as_uri()
    # Use the response as a context manager so the file handle is closed
    # deterministically instead of waiting for garbage collection.
    with urlopen(url=uri) as response:  # nosec
        doc = response.read().decode()
    return [
        unmark(line.strip().lower()) for line in html2text.html2text(doc).splitlines()
    ]
def parser(path: Union[str, Path] = here.resolve() / "tmp.html") -> Dict:
    """Extract report fields from the plain-text rendering of an HTML file.

    Every line produced by ``convert_html_to_plain_text(path)`` is tested
    against a fixed sequence of gate patterns; only the first gate that
    matches a line is acted on. When a value pattern matches several times
    on one line, the last match is the one that is kept.

    Args:
        path: HTML file to read; defaults to ``tmp.html`` next to this script.

    Returns:
        A dict that may contain ``"date"`` (ISO date string, or the raw text
        when it is not in ``%m/%d/%Y`` form), the integer counts
        ``"returned"``, ``"requested"``, ``"rejected"``, ``"in-person"`` and
        ``"total"``, and the float ``"turnout-rate"``. If at least one field
        was found, ``"locality"`` is added from the ``STATE`` environment
        variable (``"index"`` is reported as ``"US"``). Otherwise the dict
        is empty.
    """
    # Ordered (gate pattern, output key) pairs. The order is significant:
    # a line is attributed to the first gate it satisfies.
    count_fields = (
        (RETURNED, "returned"),
        (REQUESTED, "requested"),
        (REJECTED, "rejected"),
        (IN_PERSON, "in-person"),
        (TOTAL, "total"),
    )

    fields = {}
    for text in convert_html_to_plain_text(path):
        if REPORT_DATE.findall(text):
            for raw_date in DATE.findall(text):
                try:
                    parsed = datetime.strptime(raw_date, "%m/%d/%Y")
                except ValueError:
                    # Not in the expected format: keep the matched text.
                    fields["date"] = raw_date
                else:
                    fields["date"] = parsed.date().isoformat()
            continue

        for gate, key in count_fields:
            if gate.findall(text):
                for digits in BIG_NUMBER.findall(text):
                    fields[key] = int(digits.replace(",", ""))
                # A matching gate claims the line even if no number was found.
                break
        else:
            # No count gate matched; the turnout gate is checked last.
            if TURNOUT.findall(text):
                for pct in PERCENT.findall(text):
                    fields["turnout-rate"] = float(pct)

    if not fields:
        return fields

    state = os.getenv("STATE", "")
    fields["locality"] = "US" if state == "index" else state
    return fields
def main():
    """Parse the default HTML file and append the result to ``tmp.jsonl``.

    Nothing is written when ``parser()`` returns an empty record. Otherwise
    the record is appended as a single JSON line to ``tmp.jsonl`` in the
    directory of this script.
    """
    record = parser()
    if not record:
        return
    with (here / "tmp.jsonl").open("a") as sink:
        json.dump(record, sink)
        sink.write("\n")


# Run the parser only when the file is executed as a script.
if __name__ == "__main__":
    main()