Source code for finds.structured.pstat

"""Compustat annual, quarterly and key developments

Copyright 2022-2024, Terence Lim

MIT License
"""
from typing import Any, Dict, List, Tuple
import numpy as np
import pandas as pd
import time
from pandas import DataFrame, Series
from pandas.api.types import is_list_like, is_integer_dtype
from sqlalchemy import Table, Column, Index
from sqlalchemy import Integer, String, Float, SmallInteger, Boolean, BigInteger
from finds.database.sql import SQL
from finds.database.redisdb import RedisDB
from finds.structured.busday import BusDay
from finds.structured.structured import Structured
_VERBOSE = 0
                 
[docs]class PSTAT(Structured): """Provide interface to Compustat structured data sets Args: sql: Connection to mysql database bd: Custom business days object name: Name of dataset is "PSTAT" identifier: Stocks identifier field name is "gvkey" Notes: - Screen on (INDFMT= 'INDL', DATAFMT='STD', POPSRC='D', and CONSOL='C') keeps majority of records and uniquely identifies GVKEY, DATADATE. also include INDFMT= 'FS', CURRENCY='USD+CAD', STATUS='ACTIVE+INACTIVE') """ _role = Series({ # Key Development role id labels 1: 'Target', 2: 'Advisor', 3: 'Buyer', 4: 'Seller', 5: 'Transaction', 6: 'Transaction Consideration', 7: 'Lender', 8: 'Participant', 9: 'TradingItemId', 10: 'Auditor', 11: 'Sponsor', 14: 'Host', }, name='role') _event = Series({ # Key Development event id labels 1: 'Seeking to Sell/Divest', # may be "not sell" 3: 'Seeking Acquisitions/Investments', 5: 'Seeking Financing/Partners', # too general, mentions banks 7: 'Bankruptcy - Other', # good: includes contemplates and motions 11: 'Delayed SEC Filings', # good 12: 'Delistings', # good, but beware of microcap 16: 'Executive/Board Changes - Other', 21: 'Discontinued Operations/Downsizings', 22: 'Strategic Alliances', 23: 'Client Announcements', 24: 'Regulatory Agency Inquiries', 25: 'Lawsuits & Legal Issues', 26: 'Corporate Guidance - Lowered', 27: 'Corporate Guidance - Raised', 28: 'Announcements of Earnings', 29: 'Corporate Guidance - New/Confirmed', 31: 'Business Expansions', 32: 'Business Reorganizations', 36: 'Buybacks', 41: 'Product-Related Announcements', 42: 'Debt Financing Related', 43: 'Restatements of Operating Results', 44: 'Labor-related Announcements', 45: 'Dividend Affirmations', 46: 'Dividend Increases', 47: 'Dividend Decreases', 48: 'Earnings Calls', 49: 'Guidance/Update Calls', 50: 'Shareholder/Analyst Calls', 51: 'Company Conference Presentations', 52: 'M&A Calls', 53: 'Stock Splits & Significant Stock Dividends', 54: 'Stock Dividends (<5%)', 55: 'Earnings Release Date', 56: 'Name Changes', 57: 'Exchange Changes', 58: 'Ticker Changes', 59: 'Auditor Going Concern Doubts', 60: 'Address Changes', 61: 'Delayed Earnings Announcements', 62: 'Annual General Meeting', 63: 'Considering Multiple Strategic Alternatives', 64: 'Ex-Div Date (Regular)', 65: 'M&A Rumors and Discussions', # 68 : 'Credit Rating - S&P - Upgrade', # 69 : 'Credit Rating - S&P - Downgrade', # 70 : 'Credit Rating - S&P - Not-Rated Action', # 71 : 'Credit Rating - S&P - New Rating', # 72 : 'Credit Rating - S&P - CreditWatch/Outlook Action', 73: 'Impairments/Write Offs', 74: 'Debt Defaults', 75: 'Index Constituent Drops', 76: 'Legal Structure Changes', 77: 'Changes in Company Bylaws/Rules', 78: 'Board Meeting', 79: 'Fiscal Year End Changes', 80: 'M&A Transaction Announcements', 81: 'M&A Transaction Closings', 82: 'M&A Transaction Cancellations', 83: 'Private Placements', 85: 'IPOs', 86: 'Follow-on Equity Offerings', 87: 'Fixed Income Offerings', 88: 'Derivative/Other Instrument Offerings', 89: 'Bankruptcy - Filing', 90: 'Bankruptcy - Conclusion', 91: 'Bankruptcy - Emergence/Exit', 92: 'End of Lock-Up Period', 93: 'Shelf Registration Filings', 94: 'Special Dividend Announced', 95: 'Index Constituent Adds', 97: 'Special/Extraordinary Shareholders Meeting', 99: 'Potential Privatization of Government Entities', 100: 'Ex-Div Date (Special)', 101: 'Executive Changes - CEO', 102: 'Executive Changes - CFO', # 103 : 'LCD Institutional Loan News', # 104 : 'LCD Trend News', # 105 : 'LCD Fallen Angel News', # 106 : 'LCD Debtor-in-possession News', # 107 : 'LCD Middle Market News', # 108 : 'LCD High-Yield Bond Story News', # 109 : 'LCD Leveraged Buyout News', # 110 : 'LCD People Story News', # 111 : 'LCD Sponsored Deal News', # 112 : 'LCD M&A News', # 113 : 'LCD Distressed News', # 114 : 'LCD Break Price News', # 115 : 'LCD Investment Grade Loan News', # 116 : 'LCD Repricing News', # 117 : 'LCD Dividend News', # 118 : 'LCD Repayment News', # 119 : 'LCD Mezzanine Debt News', # 120 : 'LCD Second-lien News', # 121 : 'LCD High-yield Europe News', # 122 : 'LCD Covenant-lite News', # 123 : 'LCD Cross-border Deal News', # 124 : 'LCD CLO News', # 125 : 'LCD Secondary Story News', # 127 : 'LCD Amendment News', # 128 : 'LCD Communications News', # 129 : 'LCD European News', # 130 : 'LCD Price-flex News', # 131 : 'LCD Global News', # 132 : 'LCD Ratings News', 134: 'Composite Units Offerings', 135: 'Structured Products Offerings', 136: 'Public Offering Lead Underwriter Change', 137: 'Spin-Off/Split-Off', 138: 'Announcements of Sales/Trading Statement', 139: 'Sales/Trading Statement Calls', 140: 'Sales/Trading Statement Release Date', # 141 : 'LCD Bids Wanted in Competition', # 142 : 'LCD Company Buys Back Outstanding Bank Debt', # 143 : 'LCD Debt Exchange', 144: 'Estimated Earnings Release Date (CIQ Derived)', # 145 : 'LCD Loan Credit Default Swap News', # 146 : 'LCD Credit Defaults Swap News', # 147 : 'LCD Default News', # 148 : 'LCD Deal Launch News', 149: 'Conferences', 150: 'Auditor Changes', 151: 'Buyback Update', 152: 'Potential Buyback', 153: 'Bankruptcy - Asset Sale/Liquidation', 154: 'Bankruptcy - Financing', 155: 'Bankruptcy - Reorganization', 156: 'Investor Activism - Proposal Related', 157: 'Investor Activism - Activist Communication', 160: 'Investor Activism - Target Communication', 163: 'Investor Activism - Proxy/Voting Related', 164: 'Investor Activism - Agreement Related', 172: 'Investor Activism - Nomination Related', 177: 'Investor Activism - Financing Option from Activist', 187: 'Investor Activism - Supporting Statements', 192: 'Analyst/Investor Day', 194: 'Special Calls', 205: 'Regulatory Authority - Regulations', 206: 'Regulatory Authority - Compliance', 207: 'Regulatory Authority - Enforcement Actions', # 208 : 'Macro: Releases', # 209 : 'Macro: General', # 210 : 'Macro: Auctions', # 211 : 'Macro: Seminars', # 212 : 'Macro: Holidays', 213: 'Dividend Cancellation', 214: 'Dividend Initiation', 215: 'Preferred Dividend', # 216 : 'S&P Events', # 217 : "Not a Keydev - Only for Timeline" 218: "Announcement of Interim Management Statement", 219: "Operating Results Release Date", 220: "Interim Management Statement Release Date", 221: "Operating Results Calls", 222: "Interim Management Statement Calls", 223: "Fixed Income Calls", 224: "Halt/Resume of Operations - Unusual Events", 225: "Corporate Guidance - Unusual Events", 226: "Announcement of Operating Results", 230: "Buyback - Change in Plan Terms", 231: "Buyback Tranche Update", 232: "Buyback Transaction Announcements", 233: "Buyback Transaction Cancellations", 234: "Buyback Transaction Closings" }, name='event') @property def role(self): """Maps keydev role id to description""" return self._role @property def event(self): """Maps keydev event id to description""" return self._event def __init__(self, sql : SQL, bd : BusDay, name : str = 'PSTAT', identifier : str = 'gvkey', verbose : int = _VERBOSE): """Initialize Compustat tables""" tables = { 'links': sql.Table( 'links', Column('gvkey', Integer, primary_key=True), Column('conm', String(30)), Column('tic', String(8)), Column('cusip', String(9)), Column('cik', Integer, default=0), Column('sic', SmallInteger, default=0), Column('naics', Integer, default=0), Column('linkprim', String(1)), Column('liid', String(3)), Column('linktype', String(2)), Column('lpermno', Integer, default=0), Column('lpermco', Integer, default=0), Column('linkdt', Integer, default=0, primary_key=True), Column('linkenddt', Integer, default=0), sql.Index('cusip', 'linkdt'), sql.Index('cik', 'linkdt'), sql.Index('lpermno', 'linkdt'), ), 'annual': sql.Table( 'annual', Column('gvkey', Integer, primary_key=True), Column('datadate', Integer, primary_key=True), Column('indfmt', String(4), primary_key=True), Column('consol', String(1), primary_key=True), Column('popsrc', String(1), primary_key=True), Column('datafmt', String(3), primary_key=True), Column('curcd', String(3), primary_key=True), Column('costat', String(1)), Column('cusip', String(9)), Column('addzip', String(16), default=0), ### NEW ### Column('cik', BigInteger, default=0), Column('fyr', SmallInteger, default=0), Column('naics', Integer, default=0), Column('sic', SmallInteger, default=0), Column('fyear', SmallInteger, default=0), Column('prcc_f', Float), Column('sich', SmallInteger, default=0), *(Column(key, Float) for key in [ 'aco', 'acox', 'act', 'ao', 'aox', 'ap', 'aqc', 'aqi', 'aqs', 'at', 'caps', 'capx', 'capxv', 'ceq', 'ceql', 'ceqt', 'ch', 'che', 'chech', 'cogs', 'cshfd', 'csho', 'cshrc', 'dc', 'dclo', 'dcpstk', 'dcvsr', 'dcvsub', 'dcvt', 'dd', 'dd1', 'dd2', 'dd3', 'dd4', 'dd5', 'dlc', 'dltis', 'dlto', 'dltp', 'dltt', 'dm', 'dn', 'do', 'dp', 'dpact', 'dpc', 'dpvieb', 'ds', 'dv', 'dvc', 'dvp', 'dvt', 'ebit', 'ebitda', 'emp', 'epsfx', 'epspx', 'esub', 'esubc', 'fatb', 'fatl', 'fca', 'fopo', 'gdwl', 'gp', 'gwo', 'ib', 'ibadj', 'ibc', 'ibcom', 'icapt', 'idit', 'intan', 'intc', 'invfg', 'invrm', 'invt', 'invwip', 'itcb', 'itci', 'ivaeq', 'ivao', 'ivch', 'ivst', 'lco', 'lcox', 'lct', 'lifr', 'lifrp', 'lo', 'lse', 'lt', 'mib', 'mibt', 'mii', 'mrc1', 'mrc2', 'mrc3', 'mrc4', 'mrc5', 'mrct', 'msa', 'ni', 'niadj', 'nopi', 'nopio', 'np', 'oancf', 'ob', 'oiadp', 'oibdp', 'pi', 'ppegt', 'ppent', 'ppeveb', 'prstkc', 'pstk', 'pstkc', 'pstkl', 'pstkn', 'pstkr', 'pstkrv', 'rea', 'reajo', 'recco', 'recd', 'rect', 'recta', 'rectr', 'reuna', 'revt', 'sale', 'scstkc', 'seq', 'spi', 'sppe', 'sstk', 'tlcf', 'tstk', 'tstkc', 'tstkn', 'txc', 'txdb', 'txdi', 'txditc', 'txfed', 'txfo', 'txp', 'txr', 'txs', 'txt', 'txw', 'wcap', 'xacc', 'xad', 'xido', 'xidoc', 'xint', 'xlr', 'xopr', 'xpp', 'xpr', 'xrd', 'xrdp', 'xrent', 'xsga' ]), ), 'quarterly': sql.Table( 'quarterly', Column('gvkey', Integer, primary_key=True), Column('datadate', Integer, primary_key=True), Column('indfmt', String(4), primary_key=True), Column('consol', String(1), primary_key=True), Column('popsrc', String(1), primary_key=True), Column('datafmt', String(3), primary_key=True), Column('costat', String(1), primary_key=True), Column('cusip', String(9)), Column('cik', BigInteger, default=0), Column('naics', Integer, default=0), Column('sic', SmallInteger, default=0), Column('datacqtr', String(6)), Column('datafqtr', String(6)), Column('fqtr', SmallInteger, default=0), Column('fyearq', SmallInteger, default=0), Column('rdq', Integer, default=0), Column('prccq', Float), *(Column(key, Float) for key in ['actq', 'atq', 'ceqq', 'cheq', 'cogsq', 'cshoq', 'dlcq', 'ibq', 'lctq', 'ltq', 'ppentq', 'pstkq', 'pstkrq', 'revtq', 'saleq', 'seqq', 'txtq', 'xsgaq']), ), 'keydev': sql.Table( 'keydev', Column('keydevid', Integer, primary_key=True), Column('companyid', Integer, default=0), Column('companyname', String(100)), Column('keydeveventtypeid', SmallInteger, primary_key=True), Column('keydevtoobjectroletypeid', SmallInteger, primary_key=True), Column('announcedate', Integer, primary_key=True), Column('gvkey', Integer, primary_key=True), ), 'customer': sql.Table( 'customer', Column('gvkey', Integer, primary_key=True), # Supplier GVKEY Column('conm', String(100)), # Supplier Name Column('cgvkey', Integer, primary_key=True), # Customer GVKEY Column('cconm', String(100)), # Cust Current Name Column('cnms', String(100)), # Customer Name Column('srcdate', Integer, primary_key=True), # Source Date Column('cid', SmallInteger, default=0), #Cust Identifier Column('sid', SmallInteger, default=0), #Cust Segment Ident Link Column('ctype', String(7)), # Customer Type Column('salecs', Float), # Customer Sales Column('scusip', String(9)), # Supplier CUSIP Column('stic', String(7)), # Supplier Ticker Symbol Column('ccusip', String(9)), # Customer CUSIP Column('ctic', String(7)), # Customer Ticker Symbol ), } super().__init__(sql, bd, tables, identifier=identifier, name=name, verbose=verbose)
[docs] def build_lookup(self, source: str, target: str, date_field='linkdt', dataset: str = 'links', fillna: Any = None) -> Any: """Build lookup function to return target identifier from source""" return super().build_lookup(source=source, target=target, date_field=date_field, dataset=dataset, fillna=fillna)
[docs] def get_permnos(self, keys: List[str], date: int, link_perm='lpermno', link_date: str ='date', permno='permno') -> DataFrame: """Return list of permnos mapped to gvkeys as of a date Args: keys: Input list of gvkeys to lookup date: Prevailing date of link """ return super().get_permnos(keys, date, link_perm='lpermno', link_date='date', permno='permno')
[docs] def get_linked(self, dataset: str, fields: List[str], date_field: str = 'datadate', link_perm: str = 'lpermno', link_date: str = 'linkdt', where: str = '', limit: int | str | None = None) -> DataFrame: """Query a pstat table, and return with linked crsp permno Args: dataset: pstat dataset to query fields : Names of fields to return date_field: Name of date field in pstat table to query link_date: Name of link date field in 'links' table link_perm: Name of permno field in 'links' table where : Sql where clause, as sql string (optional) limit : Maximum number of records to return (optional) Returns: DataFrame containing result of query Examples: >>> df = pstat.get_linked(dataset='annual', date_field='datadate', fields=['ceq','pstkrv','pstkl','pstk'], where='ceq > 0 and datadate>=19930104 and datadate<=20991231') >>> df = keydev.get_linked(dataset='keydev', date_field='announcedate', fields=['companyname', 'keydeveventtypeid', 'keydevtoobjectroletypeid'], where='', limit=''): Notes: :: select keydev.companyname, keydev.keydeveventtypeid, keydev.keydevtoobjectroletypeid, keydev.announcedate, keydev.gvkey, lpermno as permno from keydev left join links on keydev.gvkey = links.gvkey and links.linkdt = (select max(c.linkdt) as linkdt from links c where c.gvkey = keydev.gvkey and c.linkdt <= keydev.announcedate) where lpermno is not null and keydev.gvkey > 0 and links.gvkey > 0 and announcedate >= 20180301 limit 100; """ return super().get_linked(dataset=dataset, date_field=date_field, fields=fields, link_perm='lpermno', link_date='linkdt', where=where, limit=limit)
if __name__ == "__main__": from secret import credentials, paths VERBOSE = 1 sql = SQL(**credentials['sql'], verbose=VERBOSE) user = SQL(**credentials['user'], verbose=VERBOSE) rdb = RedisDB(**credentials['redis']) bd = BusDay(sql) pstat = PSTAT(sql, bd) downloads = paths['data'] / 'PSTAT' ''' # load links links_df = pstat.load_csv( 'links', downloads / 'links.txt.gz', sep='\t', drop={'lpermno': ['0', 0], 'linkprim': ['N', 'J']}, keep={'linktype': ['LC', 'LU']}, # researched and unresearched links" replace={'linkdt': (['C', 'E', 'B'], 0), 'linkenddt': (['C', 'E', 'B'], 0)}) print(len(links_df), 33036) lag = links_df.shift() f = (lag.gvkey == df.gvkey) & (lag.lpermno != links_df.lpermno) print('permnos in links changed in ', sum(f), 'of', len(links_df), 1063) # load annual annual_df = pstat.load_csv('annual', downloads / 'annual.txt.gz', sep='\t') print(len(annual_df), 464753) print(annual_df.isna().mean().sort_values().tail(5)) # load quarterly quarterly_df = pstat.load_csv('quarterly', downloads / 'quarterly.txt.gz', sep='\t') print(len(quarterly_df), 1637274) print(quarterly_df.isna().mean().sort_values().tail(5)) # load keydevs for filename in sorted(downloads.glob('keydev*.txt.gz')): tic = time.time() keydev_df = pstat.load_csv('keydev', downloads / filename, sep='\t', drop={'gvkey': [0, '0'], 'announcedate': [0, '0'], 'keydevid': [0, '0']}) print(len(keydev_df), filename, time.time() - tic) print(sql.run('select count(*) from keydev'), 12256909) ''' # load principal customers supply_df = pstat.load_csv('customer', downloads / 'supplychain.txt', sep='\t', drop={'gvkey': ['', 0, '0'], 'ctic': ['', 0, '0'], 'stic': ['', 0, '0']}) print(len(supply_df), 107114) # TODO: loop over String cols and compare lens for col in ['conm', 'cconm', 'cnms', 'ctype', 'scusip', 'stic', 'ccusip', 'ctic']: print(col, max(supply_df[col].str.len()), min(supply_df[col].str.len())) '''nanmean pstkrq 0.278155 dlcq 0.283994 lctq 0.368958 actq 0.373740 xsgaq 0.385008 xlr 0.775819 ob 0.827637 cshrc 0.859283 scstkc 0.974569 gwo 0.977663 '''