diff --git a/scripts/bioguide.py b/scripts/bioguide.py index 45ed77f14..6199511ff 100755 --- a/scripts/bioguide.py +++ b/scripts/bioguide.py @@ -1,15 +1,10 @@ #!/usr/bin/env python -# gets fundamental information for every member with a bioguide ID: -# first name, nickname, middle name, last name, name suffix -# birthday +# Updates our database using a deep parse of the bioguide. # options: -# --cache: load from cache if present on disk (default: true) -# --current: do *only* current legislators (default: true) -# --historical: do *only* historical legislators (default: false) -# --bioguide: do *only* a single legislator -# --relationships: Get familial relationships to other members of congress past and present, when applicable +# --cache: load bioguide from cache if present on disk (default: true) +# --bioguide X000000: do *only* a single legislator import lxml.html, io import datetime @@ -17,200 +12,93 @@ import utils from utils import download, load_data, save_data +from bioguide2 import parse_bioguide_entry + def run(): + # Testing? + if utils.flags().get('stdin'): + import sys, pprint + from bioguide2 import Elected + r = Elected.parser().parse_text(sys.stdin.read().strip(), matchtype='complete', eof=True) + print(r.string) + pprint.pprint(r.multi_info()[1]) + sys.exit(0) + + # Fetch the bioguide. Hits the network if the cache of the bioguide + # isn't present yet, or if --cache=False is set. + one_bioguide, bioguide_entries = download_the_bioguide() + + # Do a deep parse on the bioguide. + parse_the_bioguide(bioguide_entries) + + # Save result. + if not one_bioguide: + # Save a cached file if we aren't just parsing one record. 
+ save_data(bioguide_entries, "bioguide-parsed.yaml") + else: + import rtyaml + print(one_bioguide) + print(rtyaml.dump(bioguide_entries[one_bioguide])) + - def update_birthday(bioguide, person, main): - - birthday = birthday_for(main) - if not birthday: - print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8"))) - warnings.append(bioguide) - return - if birthday == "UNKNOWN": - return - - try: - birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y") - except ValueError: - print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8"))) - warnings.append(bioguide) - return - - birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day) - person.setdefault("bio", {})["birthday"] = birthday - - - def birthday_for(string): - # exceptions for not-nicely-placed semicolons - string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April") - string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802") - string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967") - string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962") - string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947") - string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968") - - # look for a date - pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? 
\d{4})" - match = re.search(pattern, string, re.I) - if not match or not match.group(1): - # specifically detect cases that we can't handle to avoid unnecessary warnings - if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN" - if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN" - return None - return match.group(1).strip() - - def relationships_of(string): - # relationship data is stored in a parenthetical immediately after the end of the tag in the bio - # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)" - pattern = "^\((.*?)\)" - match = re.search(pattern, string, re.I) - - relationships = [] - - if match and len(match.groups()) > 0: - relationship_text = match.group(1).encode("ascii", "replace") - - # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar - from nltk import tree, pos_tag, RegexpParser - tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text) - pos = pos_tag(tokens) - - grammar = r""" - NAME: {+} - NAMES: { (?:)* } - RELATIONSHIP: { + } - MATCH: { } - """ - cp = RegexpParser(grammar) - chunks = cp.parse(pos) - - # iterate through the Relationship/Names pairs - for n in chunks: - if isinstance(n, tree.Tree) and n.node == "MATCH": - people = [] - relationship = None - for piece in n: - if piece.node == "RELATIONSHIP": - relationship = " ".join([x[0] for x in piece]) - elif piece.node == "NAMES": - for name in [x for x in piece if isinstance(x, tree.Tree)]: - people.append(" ".join([x[0] for x in name])) - for person in people: - relationships.append({ "relation": relationship, "name": person}) - return relationships +def download_the_bioguide(): # default to caching cache = utils.flags().get('cache', True) force = not cache - # 
pick either current or historical - # order is important here, since current defaults to true - if utils.flags().get('historical', False): - filename = "legislators-historical.yaml" - elif utils.flags().get('current', True): - filename = "legislators-current.yaml" - else: - print("No legislators selected.") - exit(0) - - print("Loading %s..." % filename) - legislators = load_data(filename) - - - # reoriented cache to access by bioguide ID - by_bioguide = { } - for m in legislators: - if "bioguide" in m["id"]: - by_bioguide[m["id"]["bioguide"]] = m - - - # optionally focus on one legislator - - bioguide = utils.flags().get('bioguide', None) - if bioguide: - bioguides = [bioguide] - else: - bioguides = list(by_bioguide.keys()) - - warnings = [] - missing = [] - count = 0 - families = 0 - - for bioguide in bioguides: - # Download & parse the HTML of the bioguide page. - try: - dom = fetch_bioguide_page(bioguide, force) - except Exception as e: - print(e) - missing.append(bioguide) - continue - - # Extract the member's name and the biography paragraph (main). - - try: - name = dom.cssselect("p font")[0] - main = dom.cssselect("p")[0] - except IndexError: - print("[%s] Missing name or content!" % bioguide) - exit(0) - - name = name.text_content().strip() - main = main.text_content().strip().replace("\n", " ").replace("\r", " ") - main = re.sub("\s+", " ", main) - - # Extract the member's birthday. - - update_birthday(bioguide, by_bioguide[bioguide], main) - - # Extract relationships with other Members of Congress. - - if utils.flags().get("relationships", False): - #relationship information, if present, is in a parenthetical immediately after the name. 
- #should always be present if we passed the IndexError catch above - after_name = dom.cssselect("p font")[0].tail.strip() - relationships = relationships_of(after_name) - if len(relationships): - families = families + 1 - by_bioguide[bioguide]["family"] = relationships - - count = count + 1 - - - print() - if warnings: - print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings))) - - if missing: - print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing))) - - print("Saving data to %s..." % filename) - save_data(legislators, filename) - - print("Saved %d legislators to %s" % (count, filename)) - - if utils.flags().get("relationships", False): - print("Found family members for %d of those legislators" % families) - - # Some testing code to help isolate and fix issued: - # f - # none = "PEARSON, Joseph, a Representative from North Carolina; born in Rowan County, N.C., in 1776; completed preparatory studies; studied law; was admitted to the bar and commenced practice in Salisbury, N.C.; member of the State house of commons; elected as a Federalist to the Eleventh, Twelfth, and Thirteenth Congresses (March 4, 1809-March 3, 1815); while in Congress fought a duel with John George Jackson, of Virginia, and on the second fire wounded his opponent in the hip; died in Salisbury, N.C., October 27, 1834." - # print "Pearson (none): %s" % birthday_for(none) - - # owens = "OWENS, William, a Representative from New York; born in Brooklyn, Kings County, N.Y., January, 20, 1949; B.S., Manhattan College, Riverdale, N.Y., 1971; J.D., Fordham University, New York, N.Y., 1974; United States Air Force; lawyer, private practice; faculty, State University of New York, Plattsburgh, N.Y., 1978-1986; elected as a Democrat to the One Hundred Eleventh Congress, by special election to fill the vacancy caused by the resignation of United States Representative John McHugh, and reelected to the two succeeding Congresses (November 3, 2009-present)." 
- # print "Owens (January, 20, 1949): %s" % birthday_for(owens) - - # shea = "SHEA-PORTER, Carol, a Representative from New Hampshire; born in New York City, New York County, N.Y., December, 1952; graduated from Oyster River High School, Durham, N.H., 1971; B.A., University of New Hampshire, Durham, N.H., 1975; M.P.A., University of New Hampshire, Durham, N.H., 1979; social worker; professor; elected as a Democrat to the One Hundred Tenth Congress and to the succeeding Congress (January 3, 2007-January 3, 2011); unsuccessful candidate for reelection to the One Hundred Twelfth Congress in 2010; elected as a Democrat to the One Hundred Thirteenth Congress (January 3, 2013-present)." - # print "Shea (none): %s" % birthday_for(shea) - - # control = "PEARSON, Richmond, a Representative from North Carolina; born at Richmond Hill, Yadkin County, N.C., January 26, 1852; attended Horner's School, Oxford, N.C., and was graduated from Princeton College in 1872; studied law; was admitted to the bar in 1874; in the same year was appointed United States consul to Verviers and Liege, Belgium; resigned in 1877; member of the State house of representatives 1884-1886; elected as a Republican to the Fifty-fourth and Fifty-fifth Congresses (March 4, 1895-March 3, 1899); successfully contested the election of William T. Crawford to the Fifty-sixth Congress and served from May 10, 1900, to March 3, 1901; appointed by President Theodore Roosevelt as United States consul to Genoa, Italy, December 11, 1901, as Envoy Extraordinary and Minister Plenipotentiary to Persia in 1902, and as Minister to Greece and Montenegro in 1907; resigned from the diplomatic service in 1909; died at Richmond Hill, Asheville, N.C., September 12, 1923; interment in Riverside Cemetery." 
- # print "\nControl (January 26, 1852): %s" % birthday_for(control) + bioguide_entries = { } + for filename in ("legislators-historical.yaml", "legislators-current.yaml"): + print("Fetching bioguide entries for legislators in %s..." % filename) + legislators = load_data(filename) + + # reoriented cache to access by bioguide ID + by_bioguide = { } + for m in legislators: + if "bioguide" in m["id"]: + by_bioguide[m["id"]["bioguide"]] = m + + # optionally focus on one legislator + one_bioguide = utils.flags().get('bioguide', None) + if one_bioguide: + if one_bioguide not in by_bioguide: + continue + bioguides = [one_bioguide] + else: + bioguides = sorted(by_bioguide.keys()) + + # Download & parse the HTML of the bioguide pages. + for bioguide in bioguides: + try: + dom = fetch_bioguide_page(bioguide, force) + except Exception as e: + print(e) + continue + + # Extract the member's name and the biography paragraph. + try: + name = dom.cssselect("p font")[0] + biography = dom.cssselect("p")[0] + except IndexError: + print("[%s] Missing name or content!" % bioguide) + continue + + name = name.text_content().strip().rstrip(',') + biography = biography.text_content().strip().replace("\n", " ").replace("\r", " ") + biography = re.sub("\s+", " ", biography) + + bioguide_entries[bioguide] = { + "name": name, + "text": biography, + } + + return one_bioguide, bioguide_entries def fetch_bioguide_page(bioguide, force): url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide cache = "legislators/bioguide/%s.html" % bioguide try: - body = download(url, cache, force) + body = download(url, cache, force, options={ "log_downloads": True }) # Fix a problem? body = body.replace("Á\xc2\x81", "Á") @@ -232,5 +120,31 @@ def fetch_bioguide_page(bioguide, force): return dom +def parse_the_bioguide(bioguide_entries): + # Parse the bioguide entries using our modgrammar grammar. + # This part is slow and CPU-bound, so use a pool of workers. 
+ + from multiprocessing import Pool + + with Pool() as pool: + # Queue up all of the tasks. + tasks = { } + for bioguide in sorted(bioguide_entries): + # Queue up a call to parse_bioguide_entry. This returns an + # AsyncResult which lets us check later if the call completed. + ar = pool.apply_async( + parse_bioguide_entry, + [bioguide_entries[bioguide]['name'], bioguide_entries[bioguide]['text']]) + tasks[bioguide] = ar + + # Wait for all of the tasks to complete and store the results + # in the main dict. + for bioguide, ar in sorted(tasks.items()): + print(bioguide, bioguide_entries[bioguide]['name'], '...') + parsed_info = ar.get() + bioguide_entries[bioguide].update(parsed_info) + + + if __name__ == '__main__': run() \ No newline at end of file diff --git a/scripts/bioguide2.py b/scripts/bioguide2.py new file mode 100755 index 000000000..57a595ba2 --- /dev/null +++ b/scripts/bioguide2.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python +# +# This module builds a modgrammar context free grammar & parser +# for the text of biographical entries for Members of Congress +# at http://bioguide.congress.gov. +# +############################################################### + +import datetime, copy +from modgrammar import * + +# Utilities.... + +def grammar_from_list(literals, titlecase_too=True): + # Turns a list of strings into a Grammar that accepts + # any of those strings, i.e.: + # grammar_from_list(["ABC", "xyz"]) + # == LITERAL("ABC") | LITERAL("xyz") (plus title-cased variants by default) + g = None + for w in literals: + l = LITERAL(w) + if titlecase_too: + l |= LITERAL(w.title()) + if g is None: + g = l + else: + g |= l + return g + +MULTIWORD = WORD('-A-Za-z0-9’(),." ', greedy=True) +MULTIWORD_NOTGREEDY = WORD('-A-Za-z0-9’(),." ', greedy=False) + +################################################################ +# Build a grammar of cardinal (one, two, three, ...) and ordinal +# (first, second, third, ...) numbers, which we need for parsing +# strings like "One-hundred thirteen Congress". 
+################################################################ + +cardinal_numbers_1 = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', + 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'] +cardinal_numbers_10 = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] + +ordinal_numbers_1 = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelf", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth"] +ordinal_numbers_10 = ["twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth", "one hundredth"] + +class CardinalNumber(Grammar): + # Matches a cardinal number from "zero" to "nineteen". + grammar = grammar_from_list(cardinal_numbers_1) + def value(self): + # Turn the string into an integer by lookup into the cardinal_numbers_1 array. + return cardinal_numbers_1.index(self.string.lower()) + +class OrdinalNumber1(Grammar): + # Matches an ordinal number from "first" to "nineteenth" and two- + # word ordinal numbers like "twenty-first". + grammar = (OPTIONAL(G(grammar_from_list(cardinal_numbers_10), LITERAL('-'))), grammar_from_list(ordinal_numbers_1)) + def value(self): + # Turn the string into an integer by lookup into the cardinal_numbers_10 array. + x = ordinal_numbers_1.index(self[1].string.lower()) + 1 + if self[0]: + x += (cardinal_numbers_10.index(self[0][0].string.lower()) + 2) * 10 + return x + +class OrdinalNumber2(Grammar): + # Matches an ordinal number that is a multiple of ten from "twentieth" + # to "one hundredth". + grammar = grammar_from_list(ordinal_numbers_10) + def value(self): + # Turn the string into an integer by lookup into the ordinal_numbers_10 array. 
+ return (ordinal_numbers_10.index(self[0].string.lower()) + 2) * 10 + +class OrdinalNumber3(Grammar): + # Matches an ordinal number above "one hundredth". + grammar = CardinalNumber, LITERAL(' hundred ') | LITERAL(' Hundred '), OrdinalNumber1 | OrdinalNumber2 + def value(self): + # Turn the string into an integer. + return 100 * self[0].value() + self[2].value() + +class OrdinalNumber(Grammar): + # Matches any ordinal number (via the grammars above). + grammar = OrdinalNumber1 | OrdinalNumber2 | OrdinalNumber3 + def value(self): + # Turn the string into a number by calling the value() method of + # whichever grammar rule matched. + return self[0].value() + +################################################################ +# Build a grammar of dates. +################################################################ + +month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] + +class Date(Grammar): + # Matches a date, or just a year alone. So matches: + # e.g. "January 1, 1950" or just "1950". + grammar = OPTIONAL(grammar_from_list(month_names), LITERAL(' '), WORD('0-9', min=1, max=2), LITERAL(', ')), G(L('1') | L('2'), WORD('0-9', min=3, max=3)) + def value(self): + # Return the parsed date. + if self[0]: + # If it's a full date, return a datetime.date instance. + return datetime.date(int(self[1].string), month_names.index(self[0][0].string)+1, int(self[0][2].string)) + else: + # If it's a year alone, return the integer for the year. + return int(self[1].string) + +class DateOptRange(Grammar): + # Match a date or a date range (e.g. "January 1, 1950-January 10, 1950"). + grammar = Date, OPTIONAL(LITERAL('-') | LITERAL('–'), Date | L('present')) + def value(self): + # Return the parsed date, or if it's a range then a dict with + # 'start' and 'end' keys. 
+ if self[1] is None: + return self[0].value() + else: + return { "start": self[0].value(), "end": self[1][1].value() if isinstance(self[1][1], Date) else "present" } + +################################################################ +# Biographies begin with some parenthetical information about +# the person's name and relations to other Members of Congress, +# and a summary of the person's roles in Congress. +################################################################ + +class FamilyInfo(Grammar): + # Match relationships to other Members of Congress: + # e.g. "(grand-step-daughter-in-law of [name])" + grammar = ( + ZERO_OR_MORE(grammar_from_list(['grand-', 'grand', 'great, ', 'great ', 'great-', 'great', 'half ', 'half-', 'second ', 'step-', 'step'], titlecase_too=False)), + grammar_from_list(['relative', 'brother', 'cousin', 'daughter', 'father', 'husband', 'mother', 'nephew', 'niece', 'sister', 'son', 'uncle', 'wife', 'nephew and adopted son'], titlecase_too=False), + OPTIONAL(grammar_from_list(['-in-law'], titlecase_too=False)), + L(' of '), + LIST_OF(ANY_EXCEPT(';[]', greedy=False), sep=L(', ') | L(' and ') | L(', and '), greedy=False), + OPTIONAL(G(L(' ['), Date, L('-'), Date, L(']'))) + ) + def info(self): + # Returns the relation and the name of the other person as a dict. + ret = { + "relation": self[0].string + self[1].string + (self[2].string if self[2] else ""), + "to": { "name": self[4].string }, + } + if self[5]: + ret["to"]["born"] = self[5][1].value() + ret["to"]["died"] = self[5][3].value() + return ret + +class NameInfo(Grammar): + # Match some other name information in the initial parenthesis, e.g. + # e.g. "(elected under the name [name])" + grammar = ( + grammar_from_list(["elected under the name", "served under the name of", "formerly", "later", "subsequently", "original name,", "after election married", "fomerly married to", "formerly married to"], titlecase_too=False), + L(" "), + WORD('-A-Za-z0-9’,. 
'), # no parenthesis or semicolon so we can be greedy + ) + def info(self): + return { + "type": self[0].string, + "name": self[2].string + } + +class ParentheticalInfo(Grammar): + # Match the parenthetical information at the start of a biography, + # which is a list of FamilyInfo and NameInfo phrases. + grammar = L("("), LIST_OF(FamilyInfo | NameInfo, sep=L('; ') | L(', ') | L(' and ') | L(', and ')), L("), ") + def info(self): + # Returns the parsed information by calling the info() methods + # of the parsed phrases. + return { + "family-relations": [x.info() for x in self[1] if isinstance(x, FamilyInfo)], + "name-info": [x.info() for x in self[1] if isinstance(x, NameInfo)], + } + +state_to_abbr = { "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Dakota": "DK", "Delaware": "DE", "the District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Philippines Territory/Commonwealth": "PI", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Orleans": "OL", "Texas": "TX", "Utah": "UT", "Vermont": "VT", "the Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY" } + +class StateName(Grammar): + # Matches any state name, including 'the Territory of ' + various state names + # for the period 
before they were states. + grammar = ( + OPTIONAL(L("the Territory of ")), + grammar_from_list(state_to_abbr.keys()) + ) + def value(self): + return state_to_abbr[self[1].string] + +class CongressRole(Grammar): + # Matches e.g. "a Representative and a Senator from New York". + grammar = (LIST_OF( + GRAMMAR( + L("a "), + L("Representative") | L("Senator") | L("Resident Commissioner") | L('Delegate'), + name="type"), + sep=L(", ") | L(" and ") | L(", and ")), + L(" from "), + StateName) + def info(self): + # Returns the parsed information. Since there may be multiple + # roles that share a state name, break them apart into role-state + # pairs and return a dict. + return [{ + "type": x[1].string, + "state": self[2].value(), + } for x in self[0] if x.grammar_name == "type"] + +class VPRole(Grammar): + # Matches "Vice President of the United States". + grammar = L("Vice President of the United States") + def info(self): + return [{ "type": self.string }] + +class PresidentRole(Grammar): + # Matches e.g. "5th President of the United States". + grammar = WORD("0-9"), WORD("a-z"), L(" President of the United States") + def info(self): + return [{ + "type": self[2].string.strip(), + "ordinal": int(self[0].string), + }] + +class RoleSummaryInfo(Grammar): + # Match the full role summary information at the start of a biography + # entry, which is a list of CongressRoles or vice-president/president + # roles. + grammar = LIST_OF(CongressRole | VPRole | PresidentRole, sep=' and ') + def info(self): + # Returns the full list of roles by concatenating all of the + # roles returned by the info() methods of the matched grammars. + return sum([x.info() for x in self[0] if isinstance(x, (CongressRole, VPRole, PresidentRole))], []) + +################################################################ +# Grammars for "born in", "died on", and the lines about degrees +# held parts of the biography. 
+################################################################ + +class BornIn(Grammar): + # Match "born [birthname] in/near [city], [date]". + grammar = (LITERAL("born "), OPTIONAL(WORD('A-Za-z,. ', greedy=False), L(" ")), L("in ") | L("near ") | L("at "), WORD('A-Za-z,. ', greedy=False), LITERAL(', ') | LITERAL(', in '), Date) + def info(self): + # Returns a dict with the parsed information. + ret = { "born": { + "location": self[3].string, + "date": self[5].value(), + } } + if self[1]: + ret["birth-name"] = self[1][0].string + return ret + +class Died1(Grammar): + # Match "died on [date], in [city]" + grammar = (L("died "), OPTIONAL(L("on ")), Date, OPTIONAL(L(', in '), MULTIWORD)) + def info(self): + # Returns a dict with the parsed information. + return { "died": { + "location": self[3][1].string if self[3] else None, + "date": self[2].value() + } } + +class Died2(Grammar): + # Match "died in [city] on [date]" + grammar = (L("died in "), MULTIWORD_NOTGREEDY, OPTIONAL(L(" on ") | L(", "), Date)) + def info(self): + # Returns a dict with the parsed information. + return { "died": { + "location": self[1].string if self[1] else None, + "date": self[2][1].value() if self[2] else None, + } } + +class Died(Grammar): + # Match either of the forms of the death sentence. + grammar = Died1 | Died2 + def info(self): + # Returns a dict with the parsed information by calling + # the info() method of whichever grammar matched. + return self[0].info() + +class Degree(Grammar): + grammar = grammar_from_list(['LL.B.', 'LL.D.', 'J.D.', 'B.A.', 'M.A.', 'Ph.D.', 'D.V.M.', 'M.D.']), LITERAL(', '), MULTIWORD + def multi_info(self): return ("degrees", { "degree": self[0].string, "institution": self[2].string }) + +################################################################ +# Grammars for parsing the really important part of this: the +# parts describing when and how a person was elected. 
+################################################################ + +class ElectedFromState(Grammar): + # Matches "from [state]". + grammar = L('from '), StateName + def info(self): + return { "state": self[1].value() } + +class ToFillTheVacancy(Grammar): + # Matches "to fill the vacancy [in the term ending [date]] caused by the death of....". + grammar = ( + OPTIONAL(L('by special election ') | L('in a special election ')), + LITERAL('to fill the vacancy '), + OPTIONAL(LITERAL('in the term ending '), Date, LITERAL(', ')), + LITERAL('caused by '), + MULTIWORD_NOTGREEDY + ) + def info(self): + return { "fill-vacancy": { + "term-ending": self[2][1].value() if self[2] else None, + "reason": self[4].string } } + +class DidNotAssumeOffice(Grammar): + grammar = L('but did not assume office until '), Date + def info(self): + return { "did-not-assume-office-until": self[1].value() } + +class ReelectedSucceedingCongresses(Grammar): + # Matches "reelected to the seven succeeding Congresses", which is + # used for people elected to consecutive House terms. + grammar = OPTIONAL(LITERAL("reelected ")), LITERAL("to the "), CardinalNumber, LITERAL(" succeeding Congresses") + def value(self): + return self[2].value() + +class ReelectedInYear(Grammar): + # Matches "reelected in [year], [year], ...", which is + # used for people elected to consecutive Senate terms. 
+ grammar = LITERAL("reelected in "), LIST_OF(Date, sep=L(', ') | L(' and ') | L(', and ')) + def value(self): + return [x.value() for x in self[1] if isinstance(x, Date)] + +Party = grammar_from_list(['Adams', 'Adams Republican', 'Adams-Clay Federalist', 'Adams-Clay Republican', 'Alliance', 'American', 'American (Know-Nothing)', 'American Laborite', 'American Party', 'Anti Jacksonian', 'Anti-Administration', 'Anti-Democrat', 'Anti-Jacksonian', 'Anti-Lecompton Democrat', 'Anti-Masonic', 'Anti-Monopolist', 'Anti-administration', 'Coalitionist', 'Conservative', 'Conservative Republican', 'Constitutional Unionist', 'Crawford Federalist', 'Crawford Republican', 'Crawford Republicans', 'Democrat', 'Democrat Farmer Labor', 'Democrat-Farm Labor', 'Democrat-Liberal', 'Democrat/Independent', 'Democrat/Jacksonian', 'Democrat/Republican', 'Democrat;Republican', 'DemocratI', 'Democratic', 'Democratic Republican', 'Democratic and Union Labor', 'Farmer Laborite', 'Federalist', 'Free Silver', 'Free Soil', 'Free Soilier', 'Greenbacker', 'Home Rule', 'Independence Party (Minnesota)', 'Independent', 'Independent Democrat', 'Independent Republican', 'Independent Whig', 'Independent/Democrat', 'Independent/Republican', 'Jackson', 'Jackson Democrat', 'Jackson Democrat', 'Jackson Federalist', 'Jackson Republican', 'Jacksonian', 'Jacksonian Republican', 'Labor', 'Law and Order', 'Liberal', 'Liberal Republican', 'Liberty', 'NA', 'National', 'Nationalist', 'New Progressive', 'Nonpartisan', 'Nullifier', 'Opposition', 'Opposition Party', 'Oppositionist Party', 'PARTY', 'Popular Democrat', 'Populist', 'Pro-Administration', 'Pro-administration', 'Progressive', 'Progressive Republican', 'Prohibitionist', 'Readjuster', 'Representative', 'Republican', 'Republican\t', 'Republican-Conservative', 'Republican/Democrat', 'Republican; Independent', 'RepublicanCap', 'Silver', 'Silver Republican', 'Socialist', 'State Rights Democrat', 'States Rights', 'Unconditional Unionist', 'Union', 'Union Labor', 
'Union Republican', 'Unionist', 'Unknown', 'Van Buren Democrat', 'Whig']) + +class BecomingParty(Grammar): + # Matches "becoming a [party] in [date]", which is + # used to record a change of party affiliation. + grammar = LITERAL("becoming a "), Party, LITERAL(" in "), Date + def info(self): + return { "changed-party": { "party": self[1].string, "date": self[3].value() } } + +class ElectionDetail(Grammar): + # Various election details. + grammar = ElectedFromState | ReelectedSucceedingCongresses | ReelectedInYear | ToFillTheVacancy \ + | DidNotAssumeOffice | BecomingParty + grammar_collapse = True + +class HouseElection1(Grammar): + # Matches "to the Fiftieth Congress", which is used when a person + # is elected to the House for either one or three or more consecutive + # terms. + grammar = ( + LITERAL('to the '), + OPTIONAL(LITERAL('U.S. House of Representatives for the ')), + OrdinalNumber, + OPTIONAL(LITERAL(' Congress')), # missing when grouped with "and reelected to the X succeeding Congresses" + ) + def info(self): + return { + "type": "house", + "congresses": [self[2].value()], + } + +class HouseElection2(Grammar): + # Matches "to the Fiftieth and Fifty-first Congresses". + grammar = ( + LITERAL('to the '), + LIST_OF(OrdinalNumber, sep=L(', ') | L(' and ') | L(', and ')), + LITERAL(' Congresses'), + ) + def info(self): + return { + "type": "house", + "congresses": [x.value() for x in self[1] if isinstance(x, OrdinalNumber)], + } + +class SenateElection(Grammar): + # Matches "to the United States Senate [in 1990]". + grammar = ( + L('to the United States Senate'), + OPTIONAL(L(' in ') | L(', '), Date) + ) + def info(self): + ret = { + "type": "senate", + } + if self[1]: + ret["date"] = self[1][1].value() + return ret + +class Election(Grammar): + # Matches "elected on [date] as a [party name] ..... [election details]". 
+ grammar = ( + LITERAL('elected ') | LITERAL('reelected ') | LITERAL('successfully contested ') | LITERAL('appointed ') | LITERAL('appointed and subsequently elected '), + OPTIONAL(LITERAL('on ') | LITERAL('in '), Date, LITERAL(', ') | LITERAL(' ')), + OPTIONAL(LITERAL('as a '), Party, LITERAL(' ')), + OPTIONAL(LITERAL('(later '), REPEAT(ANY), L(') ')), + OPTIONAL(G('the election of ', MULTIWORD_NOTGREEDY)), + HouseElection1 | HouseElection2 | SenateElection, + ZERO_OR_MORE(G(LITERAL(', ') | LITERAL(', and ') | LITERAL(' and ') | LITERAL(' '), + ElectionDetail)), + ) + def info(self): + ret = [] + + # Election info. + el = self[5].info() + el.update({ + "party": self[2][1].string if self[2] else None, + "how": self[0].string.strip(), + }) + if self[1]: + el["date"] = self[1][1].value() + if self[3]: + el["party-later"] = self[3][1].string + if self[4]: + el["contested"] = self[4].string + + # Multiple House elections are specified at once. + if el.get("congresses"): + for c in el["congresses"]: + el2 = dict(el) # clone + del el2["congresses"] + el2["congress"] = c + ret.append(el2) + else: + ret.append(el) + + # Reelections. + for item in self[6]: + item = item[1] + if isinstance(item, ReelectedSucceedingCongresses): + for x in range(1, 1+item.value()): + el2 = { + "how": "reelected", + "type": el["type"], + "congress": el["congresses"][0] + x, + } + ret.append(el2) + elif isinstance(item, ReelectedInYear): + for x in item.value(): + el2 = { + "how": "reelected", + "type": el["type"], + "date": x, + } + ret.append(el2) + else: + # Update the first election info. + ret[0].update(item.info()) + + return ret + +class ElectionsDateRange1(Grammar): + # Matches "([date]-[date])". 
+ grammar = LITERAL(' ('), DateOptRange, LITERAL(')') + def value(self): + return self[1].value() + +class ElectionsDateRange2(Grammar): + # Matches ", and served [ from [date] ] to [date] / until his/her resignation on [date]" + grammar = ( + OPTIONAL(','), + OPTIONAL( + LITERAL(' and served from ') | LITERAL('; served from '), + Date, + OPTIONAL(LITERAL(',')), + ), + OPTIONAL( + LITERAL(' to ') | LITERAL(' until ') | LITERAL(' until her resignation on ') | LITERAL(' until his resignation on '), + Date, + OPTIONAL(LITERAL(', when he resigned'), MULTIWORD_NOTGREEDY) + ), + OPTIONAL(LITERAL(' until his death')), + ) + def value(self): + ret = { + "start": self[1][1].value() if self[1] else None, + "end": self[2][1].value() if self[2] else None, + } + if self[2] and "resignation" in self[2].string: + ret["end-reason"] = "resignation" + elif self[2] and self[2][2]: + ret["end-reason"] = "resignation" + elif self[3]: + ret["end-reason"] = "death" + return ret + + +class Elected(Grammar): + # Main grammar for matching an "elected ..." part of the biographical + # entry. This phrase starts with "elected" and ends with a date range. + # Within the date range there may be multiple elections. + grammar = ( + OPTIONAL( + GRAMMAR(L("upon the readmission of the State of "), StateName, OPTIONAL(L(" to repreesntation")), L(" was ")) + | GRAMMAR(L("upon the admission of "), StateName, L(" as a State into the Union, was ")) + ), + LIST_OF(Election, sep=LITERAL(" and ") | LITERAL('; ')), + ElectionsDateRange1 | ElectionsDateRange2, + OPTIONAL(ElectionsDateRange1) + ) + def multi_info(self): + ret = { + "elections": sum([e.info() for e in self[1] if isinstance(e, Election)], []), + "dates": self[2].value(), + } + if self[3]: + # Sometimes both types of date ranges are provided, e.g.: + # "until his resignation on February 28, 2010 (January 3, 1991-February 28, 2010)" + # In that case, replace values where they are redundant. 
+ ret['dates'].update(self[3].value()) + return ("elected", ret) + +################################################################ +# Fall-back grammar for parsing all other activity lines. +################################################################ + +class Activity(Grammar): + grammar = ( + REPEAT(ANY, greedy=False), + OPTIONAL(LITERAL(', ') | LITERAL(' in ') | LITERAL(' '), DateOptRange) + ) + def multi_info(self): + ret = { + "text": self[0].string, + } + if self[1]: + ret["date"] = self[1][1].value() + return ("activities", ret) + +################################################################ +# Match any single phrase in the biography separated by +# semicolons. +################################################################ + +class BiographyEntry(Grammar): + # Activity goes last because it catches anything that isn't + # picked up by one of the other grammars. This works because + # the grammars are left-to-right greedy. + grammar = BornIn | Died | Degree | Elected | Activity + grammar_collapse = True + +################################################################ +# Match a whole biography. +################################################################ + +class Biography(Grammar): + grammar = LIST_OF(BiographyEntry, sep='; ') + def info(self): + info = { } + for r in self[0]: + if hasattr(r, 'info'): + info.update(r.info()) + elif hasattr(r, 'multi_info'): + key, value = r.multi_info() + info.setdefault(key, []).append(value) + return info + +################################################################ +# Main function for parsing a bioguide entry. +################################################################ + +parser = Biography.parser() + +def parse_bioguide_entry(name, biography): + # strip the name from the biography + biography = biography[len(name)+1:].strip() + + # The parser is super slow when we make it complex. So + # we handle the initial parts of the biography in pieces. 
+ info = { } + if biography.startswith("("): + # There is some parenthesized content first. Parse + # it and strip it out. + try: + r = ParentheticalInfo.parser().parse_text(biography) + except ParseError as e: + return { "_parse_error": str(e) } + biography = biography[len(r.string):] + info.update(r.info()) + + # The next part is a summary of the roles this person held. + # Parse that and then strip it off. + try: + r = RoleSummaryInfo.parser().parse_text(biography) + info['roles'] = r.info() + biography = biography[len(r.string):] + except ParseError as e: + return { "_parse_error": str(e) } + + # The rest of the biography is a ;-delimited list of biography pieces. + biography = biography.rstrip('.') # biography always ends in a period + try: + r = parser.parse_text(biography, reset=True, matchtype='complete', eof=True) + except ParseError as e: + return { "_parse_error": str(e) } + info.update(r.info()) + + for r in info.get('elected', []): + if isinstance(r['dates'], dict) and not r['dates']['end'] and r['dates'].get('end-reason') == 'death' and info.get('died'): + r['dates']['end'] = copy.deepcopy(info['died']['date']) # cloning the date prevents weird YAML object references in output + + + return info diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 353ac3896..498b7dddb 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -6,3 +6,4 @@ lxml>=2.2 cssselect pyflakes pytz +modgrammar diff --git a/scripts/utils.py b/scripts/utils.py index 0c38b4f08..5cb637d6c 100755 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -189,7 +189,7 @@ def download(url, destination=None, force=False, options=None): body = f.read() else: try: - if options.get('debug', False): + if options.get('debug', False) or options.get('log_downloads', False): log("Downloading: %s" % url) if options.get('urllib', False):