"""Signals implements structured dataset interface for derived signal values
Copyright 2022, Terence Lim
MIT License
"""
from typing import Any, Dict, List, Tuple
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from pandas.api.types import is_list_like, is_integer_dtype
from sqlalchemy import Table, Column, Index, Integer, Float
from finds.database.sql import SQL, as_dtypes
from .stocks import Stocks
_VERBOSE = 0
[docs]class Signals(Stocks):
"""Provide structured stocks data interface to derived signal values
Args:
sql: Connection to SQL database
"""
def __init__(self, sql: SQL, verbose=_VERBOSE):
"""Initalize a connection to derived Signals values datasets"""
super().__init__(sql=sql, bd= None, tables={}, identifier='permno',
name='signals', verbose=verbose)
[docs] def __call__(self,
label: str,
date: int,
start: int = -1,
rebaldate: str = 'rebaldate') -> DataFrame:
"""Return cross-section of signal values available as of a date
Args:
label: Name of signal to retrieve
date : As of this date back through (non-inclusive) start date
start : Non-inclusive start of date range; -1 means exact date
rebaldate: Name of rebalance date column
Returns:
DataFrame of signal values prevailing as of input date
"""
return self.get_section(dataset=label, fields=[rebaldate, label],
date_field=rebaldate, date=date, start=start)
[docs] def table_key(self, label: str) -> str:
"""Helper method generates a table key name for the input label"""
return '__' + label # prefix with "__"
[docs] def __getitem__(self, label) -> Table:
"""Overrides parent class method to get Table schema of label"""
return self.sql.Table(self.table_key(label),
Column('permno', Integer, primary_key=True),
Column('rebaldate', Integer, primary_key=True),
Column(label, Float))
[docs] def summary(self, label: str) -> DataFrame:
"""Perform a 'proc summary' by rebaldate on a signal's values"""
return self.sql.summary(self.table_key(label), label, key='rebaldate')
[docs] def write(self,
data: DataFrame,
label: str,
overwrite: bool = True,
rebaldate: str = 'rebaldate',
permno: str = 'permno') -> int:
"""Saves a new sql table from dataframe of signal values
Args:
data: Signal values, with columns ['permno', 'rebaldate', label]
label: Signal name of column and table (prefixed '__')
overwrite: If False, append to table ignoring dups. Else recreate
rebaldate: Column name of rebalance dates in input dataframe
permno: Column name of permno identifiers in input dataframe
Returns:
Number of rows saved
Notes:
- first removes dup keys, then drops null rows before saving to table
"""
df = data[[permno, rebaldate, label]].copy()
df.index.name = None # 'permno' may be both index level or column label
df = df.rename(columns={permno: 'permno', rebaldate: 'rebaldate'})
table = self[label]
df = as_dtypes(df=df, columns={k.lower(): v.type
for k, v in table.columns.items()})
df = df.replace([np.inf, -np.inf], np.nan)
df = df.sort_values(by=['permno', 'rebaldate', label])
df.drop_duplicates(['permno', 'rebaldate'], keep='first', inplace=True)
df = df.dropna() # NaN's last
if overwrite:
table.drop(self.sql.engine, checkfirst=True)
table.create(self.sql.engine, checkfirst=True)
#self.sql.create_all()
self.sql.load_dataframe(table=table.key, df=df, index_label=None)
self._print("(signals_write)", label, len(df))
return len(df)
[docs] def read(self, label: str, where: str = '') -> DataFrame:
"""Read signal values from sql and return as data frame
Args:
label : Name of signal
where : Where clause for sql select
Returns:
DataFrame of query with columns = ['permno', 'rebaldate', label]
"""
if where:
where = 'WHERE' + where
table = self.table_key(label)
q = f"SELECT permno, rebaldate, {label} FROM {table} {where}"
return self.sql.read_dataframe(q).sort_values(['permno', 'rebaldate'])
[docs]class SignalsFrame(Signals):
"""Cache dataframe of signals values, provide Signals-like interface
Args:
df : DataFrame input with permno and rebaldate as columns
identifer : column name of permno identifiers
rebaldate : column name of rebalance dates
"""
def __init__(self, df: DataFrame, identifier: str = 'permno',
rebaldate: str = 'rebaldate'):
"""Initialize instance from input dataframe"""
self.data = df
self.identifier = identifier
self.rebaldate = rebaldate
[docs] def __call__(self, label: str, date: int, start: int = -1) -> DataFrame:
"""Select from rebaldates that fall between start and date, keep latest
Args:
label: Name of column to return
date: As of this date back through (non-inclusive) start date
start: Non-inclusive start date. Set to 0 for all, -1 for exact
"""
if start < 0:
start = date - 1
df = self.data.loc[self.data[self.rebaldate].le(date)
& self.data[self.rebaldate].gt(start),
[self.identifier, self.rebaldate, label]]
df = df.sort_values([self.identifier, self.rebaldate], na_position='first')\
.drop_duplicates([self.identifier], keep='last')\
.dropna()
return df.set_index(self.identifier)