Source code for finds.readers.fomcreader

"""Retrieves FOMC meeting minutes

MIT License

Copyright 2022-2023 Terence Lim
"""
import re
import time
from typing import Dict
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame, Series
from tqdm import tqdm

class FOMCReader:
    """Class to retrieve FOMC minutes

    On construction, the Federal Reserve website is scanned for links to
    meeting minutes.  The instance then behaves like a read-only mapping:
    iterating yields meeting dates, ``len()`` counts them, and indexing by
    a date downloads and returns the text of that meeting's minutes.

    Attributes:
        dates: dict mapping each meeting date (int formed from the last
            eight digits of the link, e.g. 20230201) to the minutes' url
    """

    _url = 'https://www.federalreserve.gov/'    # root url
    _timeout = 30.0     # seconds to wait on a request before giving up

    # hrefs containing "minutes" and ending in ".htm" (case-insensitive)
    _minutes_href = re.compile(r'\S+minutes\S+\.htm$', re.I)

    def __init__(self, url: str = _url, delay: float = 0.1):
        """Initializer retrieves dates available from website

        Args:
            url: root url of Federal Reserve website
            delay: sleep between requests
        """
        # latest five years' minutes can be found from a main page
        links = self._minutes_links(
            urljoin(url, 'monetarypolicy/fomccalendars.htm'))

        # earlier years' minutes are linked from annual pages, starting in
        # 1993 and stopping before the earliest year found on the main page
        first_year = min(self._date_of(link) for link in links) // 10000
        for year in tqdm(range(1993, first_year)):
            links += self._minutes_links(
                urljoin(url, 'monetarypolicy/fomchistorical%d.htm' % year))
            time.sleep(delay)

        # a later link overrides an earlier one that parses to the same date
        self.dates = {self._date_of(link): link for link in links}

    @staticmethod
    def _date_of(link: str) -> int:
        """Parse date, as int, from the last eight digits of a link string"""
        return int(re.sub(r'\D', '', link)[-8:])

    def _soup(self, url: str) -> "BeautifulSoup":
        """Download a page and return its parsed html"""
        response = requests.get(url, timeout=self._timeout)
        return BeautifulSoup(markup=response.content, features='html.parser')

    def _minutes_links(self, page_url: str) -> list:
        """Return absolute urls of all minutes links found on a page"""
        anchors = self._soup(page_url).find_all(name='a',
                                                href=self._minutes_href)
        # urljoin resolves root-relative, page-relative and absolute hrefs
        return [urljoin(page_url, a.attrs['href']) for a in anchors]

    def __len__(self):
        """Number of meeting dates found"""
        return len(self.dates)

    def __iter__(self):
        """Iterate over meeting dates"""
        return iter(self.dates)

    def __getitem__(self, date: int) -> str:
        """Retrieve FOMC minutes text from Fed website

        Args:
            date: meeting date

        Returns:
            text of minutes for meeting date

        Raises:
            KeyError: if date is not among the dates found on the website
        """
        raw = self._soup(self.dates[date])
        minutes = "\n\n".join(p.get_text().strip()
                              for p in raw.find_all('p'))
        # turn carriage returns and tabs into spaces, then squeeze runs of
        # newlines down to a single newline
        return re.sub(r'\n+', '\n', re.sub(r'[\r\t]', ' ', minutes))
if __name__ == "__main__":
    # Script: fetch FOMC minutes not yet stored locally, let the user trim
    # each one in a text editor, then save the edited texts to a scratch
    # file and to the 'FOMC' unstructured store.
    import os
    from pprint import pprint

    from finds.database import MongoDB
    from finds.unstructured import Unstructured
    from finds.utils import Store
    from secret import credentials, paths
    VERBOSE = 1

    mongodb = MongoDB(**credentials['mongodb'], verbose=VERBOSE)
    print('uptime:', mongodb.client.admin.command("serverStatus")['uptime'])
    fomc = Unstructured(mongodb, 'FOMC')

    # retrieve keys (dates) of minutes previously retrieved and stored locally
    dates = set(fomc['minutes'].distinct('date'))

    # scan the FOMC site for the meeting dates that are available
    minutes = FOMCReader()

    # fetch new minutes from FOMC site
    docs = {d: minutes[d] for d in minutes if d not in dates}
    print("New minutes:")
    pprint([f"{k}: {len(v)} chars" for k, v in docs.items()])

    def edit(text: str) -> str:
        """helper to spawn editor and write/edit/read to tempfile

        Args:
            text: initial text to load into the editor

        Returns:
            contents of the temp file after the editor exits
        """
        import subprocess
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".tmp") as f:  # save temp file
            f.write(text.encode("utf-8"))
            f.flush()
            # NOTE(review): "-nw" is passed to whatever $EDITOR names;
            # confirm it is accepted by editors other than emacs
            subprocess.call([os.environ.get('EDITOR', 'emacs'), "-nw", f.name])
            f.seek(0)
            return f.read().decode("utf-8")   # keep edited text

    if docs:
        # to edit out head and tail of each document
        results = list()
        for date, initial_message in docs.items():
            edited_text = edit(initial_message)
            results.append({'date': date, 'text': edited_text})
        results = sorted(results, key=lambda x: x['date'])   # sort by date

        # save edited docs, in a file named after the latest new date
        Store(paths['scratch'] / 'fomc', ext='gz').dump(results,
                                                        f"{max(docs)}.json")

        for doc in results:   # store docs for new dates
            fomc.insert('minutes', doc, keys=['date'])