"""Wraps around pandas_datareader to retrieve French data library
MIT License
Copyright 2022-2023 Terence Lim
"""
import pandas_datareader as pdr
import numpy as np
import pandas as pd
import io
import zipfile
from pandas import DataFrame, Series
from typing import Callable, List
import requests
[docs]class FFReader:
"""Wraps over pandas_datareader to load FamaFrench factors from website
Hints for using pandas datareader to read a single series
::
from pandas_datareader.famafrench import FamaFrenchReader as FFR
mkt = FFReader('F-F_Research_Data_Factors', start=1900, end=2099).read()
"""
def __init__(self, symbol):
"""Read data for symbol into instance"""
self.data = pdr.data.FamaFrenchReader(symbol, start=pd.to_datetime('1900-01-01')).read()
self.descr = self.data['DESCR']
def __len__(self):
return len(self.data) - 1
[docs] def __getitem__(self, key):
return self.data[key]
def __repr__(self):
return self.descr
def __str__(self):
return self.descr
daily = [
('F-F_Research_Data_5_Factors_2x3_daily', 0, ''),
('F-F_Research_Data_Factors_daily', 0, ''),
('F-F_Momentum_Factor_daily', 0, ''),
('F-F_LT_Reversal_Factor_daily', 0, ''),
('F-F_ST_Reversal_Factor_daily', 0, ''),
('49_Industry_Portfolios_daily', 0, '49vw'), # append suffix
('48_Industry_Portfolios_daily', 0, '48vw'), # to differentiate
('49_Industry_Portfolios_daily', 1, '49ew'), # value-weighted vs
('48_Industry_Portfolios_daily', 1, '48ew'), # equal-weighted
]
"""Common daily FF series, with subset index and a suggested suffix"""
monthly = [
('F-F_Research_Data_5_Factors_2x3', 0, '(mo)'),
('F-F_Research_Data_Factors', 0, '(mo)'), # "(mo)" for monthly
('F-F_Momentum_Factor', 0, '(mo)'),
('F-F_LT_Reversal_Factor', 0, '(mo)'),
('F-F_ST_Reversal_Factor', 0, '(mo)'),
]
"""Common monthly FF series, with subset index and a suggested suffix"""
[docs] @staticmethod
def sectoring(scheme: str, source: str = "") -> DataFrame | None:
"""Load FamaFrench sectoring based on sic-4, from website or zipfile
Args:
scheme: in {codes5, codes10, 12, 17, 30, 38, 48, 49}
Notes:
Retrieved from "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
+ "ftp/Siccodes5.zip"
For example, the industry definitions file for Siccodes49 looks like:
::
1 Agric Agriculture
0100-0199 Agricultural production - crops
0200-0299 Agricultural production - livestock
0700-0799 Agricultural services
0910-0919 Commercial fishing
2048-2048 Prepared feeds for animals
2 Food Food Products
2000-2009 Food and kindred products
2010-2019 Meat products
2020-2029 Dairy products
"""
# if no source url provided, use Dartmouth website base URL
if not source:
prefix_ = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
source = prefix_ + f"ftp/Sic{scheme}.zip"
# Handle cases where source is a base URL or a local zip file
if source.startswith('http'):
response = requests.get(source)
f = io.BytesIO(response.content)
subfile = 'Sic' + scheme + '.txt'
open_ = zipfile.ZipFile(f).open
else:
if source.endswith('.zip'):
open_ = zipfile.ZipFile(source).open
subfile = 'Sic' + scheme + '.txt'
else:
open_ = open
subfile = source
labels = DataFrame(columns=['name','description','end'])
with open_(subfile) as f: # open text subfile from zip archive
for line in f:
items = line.decode('utf-8').rstrip('\n').split()
if len(items) >= 1:
sic = items[0].split('-')
if (len(sic) == 2): # "-" separates two sic codes
labels.loc[int(sic[0]),'name'] = ind # append a row
labels.loc[int(sic[0]),'description'] = desc
labels.loc[int(sic[0]),'end'] = int(sic[1])
#_print(sic[0], labels.loc[int(sic[0])].values)
else:
if len(items) <= 1:
ind = '???'
else:
ind = items[1] # else is name and description
desc = " ".join(items[2:])
if ind == 'Other':
other = desc # "Other" often lacks description
# handle case if last sector is "Other" with no sic's:
# assign next sic2 not in table to be an "Other" sector
next_sic2 = (((labels.end // 100) + 1) * 100).astype(int)
df = DataFrame(columns = labels.columns)
df.loc[0, ['name', 'description']] = ['Other', other]
df.loc[max(next_sic2) ,['name', 'description']] = ['Other', other]
if len(np.unique(labels.name)) < int(scheme[5:]): # "Other" has no sics
for i in range(len(labels)-1):
if (next_sic2.iloc[i] < labels.index[i+1]
and next_sic2.iloc[i] not in labels.index):
df.loc[next_sic2.iloc[i], 'name'] = 'Other'
df.loc[next_sic2.iloc[i], 'description'] = other
# return as dataframe
sectors = pd.concat([labels, df], axis=0)\
.drop(columns=['end'])\
.sort_index()
return sectors
[docs] @staticmethod
def keys() -> List[str]:
"""Return names of all available datasets"""
return pdr.data.FamaFrenchReader(None).get_available_datasets()
[docs] @staticmethod
def fetch(name: str, item: int = 0, suffix: str = '',
date_formatter: Callable = lambda x: x) -> DataFrame:
"""Retrieve item and return as DataFrame
Args:
name: Name of research factor in Ken French website
item: Index of item to research (e.g. 0 is usually value-weighted)
suffix: Suffix to append to name (e.g. to distinguish monthly from daily)
date_formatter: to reformat dates, e.g. bd.offset or bd.endmo
Returns:
DataFrame of asset returns (converted to decimal, not percentages)
"""
df = pdr.data.FamaFrenchReader(name, start=pd.to_datetime('1900-01-01')).read()
df = df[item]
try:
df.index = df.index.to_timestamp()
except:
pass # invalid comparison error
df.index = [date_formatter(d) for d in df.index]
df.columns = [c.strip() + suffix for c in df.columns]
df.where(df > -99.99, other=np.nan, inplace=True) # replace NaNs
df = df / 100 # change percentage returns in source to decimals
return df
if __name__ == "__main__":
    # Smoke run: print the name of every available data set, one per row,
    # then load the sector definitions of the 5-industry scheme
    available = Series(FFReader.keys())
    print(available.to_string())
    df = FFReader.sectoring('codes5')