Source code for finds.readers.ffreader

"""Wraps around pandas_datareader to retrieve data from the Ken French data library

MIT License

Copyright 2022-2023 Terence Lim
"""
import pandas_datareader as pdr
import numpy as np
import pandas as pd
import io
import zipfile
from pandas import DataFrame, Series
from typing import Callable, List
import requests

[docs]class FFReader:
    """Wraps over pandas_datareader to load FamaFrench factors from website

    Hints for using pandas datareader to read a single series
    ::

       from pandas_datareader.famafrench import FamaFrenchReader as FFR
       mkt = FFReader('F-F_Research_Data_Factors', start=1900, end=2099).read()
    """
    def __init__(self, symbol):
        """Read data for symbol into instance"""
        self.data = pdr.data.FamaFrenchReader(symbol, start=pd.to_datetime('1900-01-01')).read()
        self.descr = self.data['DESCR']

    def __len__(self):
        return len(self.data) - 1
    
[docs]    def __getitem__(self, key):
        return self.data[key]

    def __repr__(self):
        return self.descr

    def __str__(self):
        return self.descr

    daily = [
        ('F-F_Research_Data_5_Factors_2x3_daily', 0, ''),
        ('F-F_Research_Data_Factors_daily', 0, ''),
        ('F-F_Momentum_Factor_daily', 0, ''),
        ('F-F_LT_Reversal_Factor_daily', 0, ''),
        ('F-F_ST_Reversal_Factor_daily', 0, ''),
        ('49_Industry_Portfolios_daily', 0, '49vw'), # append suffix
        ('48_Industry_Portfolios_daily', 0, '48vw'), #  to differentiate
        ('49_Industry_Portfolios_daily', 1, '49ew'), #  value-weighted vs
        ('48_Industry_Portfolios_daily', 1, '48ew'), #  equal-weighted
    ]
    """Common daily FF series, with subset index and a suggested suffix"""

    
    monthly = [
        ('F-F_Research_Data_5_Factors_2x3', 0, '(mo)'),
        ('F-F_Research_Data_Factors', 0, '(mo)'),   # "(mo)" for monthly
        ('F-F_Momentum_Factor', 0, '(mo)'),
        ('F-F_LT_Reversal_Factor', 0, '(mo)'),
        ('F-F_ST_Reversal_Factor', 0, '(mo)'),
    ]
    """Common monthly FF series, with subset index and a suggested suffix"""


[docs]    @staticmethod
    def sectoring(scheme: str, source: str = "") -> DataFrame | None:
        """Load FamaFrench sectoring based on sic-4, from website or zipfile

        Args:
          scheme: in {codes5, codes10, 12, 17, 30, 38, 48, 49}

        Notes:

        Retrieved from "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
        + "ftp/Siccodes5.zip"

        For example, the industry definitions file for Siccodes49 looks like:

        ::

            1 Agric  Agriculture
            0100-0199 Agricultural production - crops
            0200-0299 Agricultural production - livestock
            0700-0799 Agricultural services
            0910-0919 Commercial fishing
            2048-2048 Prepared feeds for animals
            2 Food   Food Products
            2000-2009 Food and kindred products
            2010-2019 Meat products
            2020-2029 Dairy products
        """
        # if no source url provided, use Dartmouth website base URL
        if not source:
            prefix_ = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
            source = prefix_ + f"ftp/Sic{scheme}.zip"

        # Handle cases where source is a base URL or a local zip file
        if source.startswith('http'):
            response = requests.get(source)
            f = io.BytesIO(response.content)
            subfile = 'Sic' + scheme + '.txt'
            open_ = zipfile.ZipFile(f).open
        else:
            if source.endswith('.zip'):            
                open_ = zipfile.ZipFile(source).open
                subfile = 'Sic' + scheme + '.txt'
            else:
                open_ = open
                subfile = source
        labels = DataFrame(columns=['name','description','end'])
        
        with open_(subfile) as f:   # open text subfile from zip archive
            for line in f:
                items = line.decode('utf-8').rstrip('\n').split()
                if len(items) >= 1:
                    sic = items[0].split('-')
                    if (len(sic) == 2):      # "-" separates two sic codes
                        labels.loc[int(sic[0]),'name'] = ind   # append a row
                        labels.loc[int(sic[0]),'description'] = desc
                        labels.loc[int(sic[0]),'end'] = int(sic[1])
                        #_print(sic[0], labels.loc[int(sic[0])].values)
                    else:
                        if len(items) <= 1:
                            ind = '???'  
                        else:
                            ind = items[1]   # else is name and description
                            desc = " ".join(items[2:])
                            if ind == 'Other':
                                other = desc # "Other" often lacks description
                                
        # handle case if last sector is "Other" with no sic's:
        #   assign next sic2 not in table to be an "Other" sector
        next_sic2 = (((labels.end // 100) + 1) * 100).astype(int)
        df = DataFrame(columns = labels.columns)
        df.loc[0, ['name', 'description']] = ['Other', other]
        df.loc[max(next_sic2) ,['name', 'description']] = ['Other', other]

        if len(np.unique(labels.name)) < int(scheme[5:]):  # "Other" has no sics
            for i in range(len(labels)-1):
                if (next_sic2.iloc[i] < labels.index[i+1]
                    and next_sic2.iloc[i] not in labels.index):
                    df.loc[next_sic2.iloc[i], 'name'] = 'Other'
                    df.loc[next_sic2.iloc[i], 'description'] = other

        # return as dataframe
        sectors = pd.concat([labels, df], axis=0)\
                    .drop(columns=['end'])\
                    .sort_index()
        return sectors
    
[docs]    @staticmethod
    def keys() -> List[str]:
        """Return names of all available datasets"""
        return pdr.data.FamaFrenchReader(None).get_available_datasets()

[docs]    @staticmethod
    def fetch(name: str, item: int = 0, suffix: str = '',
              date_formatter: Callable = lambda x: x) -> DataFrame:
        """Retrieve item and return as DataFrame

        Args:
          name: Name of research factor in Ken French website
          item: Index of item to research (e.g. 0 is usually value-weighted)
          suffix: Suffix to append to name (e.g. to distinguish monthly from daily)
          date_formatter: to reformat dates, e.g. bd.offset or bd.endmo

        Returns:
          DataFrame of asset returns (converted to decimal, not percentages) 
        """
        df = pdr.data.FamaFrenchReader(name, start=pd.to_datetime('1900-01-01')).read()
        df = df[item]
        try:
            df.index = df.index.to_timestamp()
        except:
            pass       # invalid comparison error
        df.index = [date_formatter(d) for d in df.index]
        df.columns = [c.strip() + suffix for c in df.columns]
        df.where(df > -99.99, other=np.nan, inplace=True)  # replace NaNs
        df = df / 100   # change percentage returns in source to decimals
        return df

if __name__ == "__main__":
    # demo: list the name of every available dataset, one per row
    available = Series(FFReader.keys())
    print(available.to_string())
    # then load the 5-sector sic code mapping from the website
    df = FFReader.sectoring('codes5')