import os, time, pandas as pd
import refinitiv.data as rd
from refinitiv.data.content import filings as rfil
# --- Input spreadsheet ------------------------------------------------------
# CSV listing the companies to process, and the two columns read from it.
CSV_FILE_PATH = "/Users/prince/Downloads/nwsheet.csv"
COMPANY_ID_COLUMN = "Identifier"  # holds RICs such as AAPL.OQ, RELI.NS, etc.
COMPANY_NAME_COLUMN = "Company Name"

# --- Output -----------------------------------------------------------------
# Root folder; one sub-folder per company is created beneath it.
DOWNLOAD_DIR = "/Users/prince/Downloads/x"

# --- Search window (both years inclusive) -----------------------------------
START_YEAR = 2015
END_YEAR = 2023

# --- LSEG access ------------------------------------------------------------
# Application key; leave empty and set the RD_APP_KEY env var instead.
LSEG_API_KEY = ""
# Value used for the MidLevelCategory search field (1 == "Annual").
ANNUAL_REPORT_CATEGORY_ID = 1
def ensure_dir(p):
    """Create directory *p* (and any missing parents) if it does not exist.

    Args:
        p: Path of the directory to create.

    Raises:
        FileExistsError: If *p* already exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process creating the directory in
    # between no longer makes this call fail.
    os.makedirs(p, exist_ok=True)
def filings_query(ric: str, y: int) -> str:
    """Build the Filings search expression for one RIC and one calendar year.

    The expression is a list of ``field:value`` clauses joined with ``AND``.
    It restricts results to the given RIC, to dates from 1 January to
    31 December of year *y*, and to the category configured in
    ``ANNUAL_REPORT_CATEGORY_ID``.

    Args:
        ric: Instrument code to search for.
        y: Calendar year to cover.

    Returns:
        The query string.
    """
    clauses = (
        f'RIC:{ric}',
        f'DateFrom:{y}-01-01',
        f'DateTo:{y}-12-31',
        f'MidLevelCategory:{ANNUAL_REPORT_CATEGORY_ID}',
    )
    return ' AND '.join(clauses)
def _download_annual_report(ric, year, company_dir, safe):
    """Search for one company's annual report for *year* and save it as a PDF.

    Prints a one-line status for each outcome (nothing found, no document id,
    no content, or downloaded). Exceptions from the library calls or from
    writing the file propagate to the caller.

    Args:
        ric: Instrument code used in the search query.
        year: Calendar year to search.
        company_dir: Existing directory the PDF is written into.
        safe: Filesystem-friendly company name used as the file-name prefix.
    """
    # 1) Search the Filings catalog.
    # NOTE(review): the result table is expected at .data.df - confirm
    # against the installed library version.
    search_resp = rfil.search(query=filings_query(ric, year), top=5)
    results = getattr(search_resp, "data", None)
    df_res = getattr(results, "df", None)
    if df_res is None or df_res.empty:
        print(f" {year}: no annual report found")
        return

    # Pick the best candidate (filter further on Title/Language if needed).
    row = df_res.iloc[0]
    # Column naming differs across library versions, so try both spellings.
    doc_id = row.get("documentId") or row.get("DocumentId")
    title = row.get("documentTitle") or row.get("DocumentTitle") or "Annual Report"
    if not doc_id:
        print(f" {year}: search returned rows without documentId")
        return

    out_path = os.path.join(company_dir, f"{safe}_Annual_Report_{year}.pdf")

    # 2) Download the document binary; the payload attribute is .raw or
    # .content depending on the library version.
    doc_resp = rfil.get_document(document_id=doc_id)
    binary = getattr(doc_resp.data, "raw", None) or getattr(doc_resp.data, "content", None)
    if not binary:
        print(f" {year}: could not retrieve document content for {doc_id}")
        return

    with open(out_path, "wb") as f:
        f.write(binary)
    print(f" {year}: downloaded '{title}' -> {out_path}")


def main():
    """Download annual reports for every company listed in the CSV.

    Opens a data session, reads the company identifiers and names from
    ``CSV_FILE_PATH``, and for each company and each year from ``START_YEAR``
    to ``END_YEAR`` (inclusive) tries to download one annual report into
    ``DOWNLOAD_DIR/<company>/``. A failure for one year is printed and does
    not stop the run. The session is closed on every exit path.
    """
    # Pass the key only when one is configured; otherwise let the library
    # pick it up from the RD_APP_KEY environment variable.
    if LSEG_API_KEY:
        rd.open_session(app_key=LSEG_API_KEY)
    else:
        rd.open_session()

    try:
        ensure_dir(DOWNLOAD_DIR)

        try:
            df = pd.read_csv(CSV_FILE_PATH)
            ids = df[COMPANY_ID_COLUMN].astype(str).tolist()
            names = df[COMPANY_NAME_COLUMN].astype(str).tolist()
        except FileNotFoundError:
            print(f"CSV not found: {CSV_FILE_PATH}")
            return
        except KeyError as e:
            print(f"Missing column in CSV: {e}")
            return

        print(f"Found {len(ids)} companies")

        for ric, name in zip(ids, names):
            ric = ric.strip()
            # astype(str) turns missing cells into the literal "nan".
            if not ric or ric == "nan":
                continue

            safe = name.replace(" ", "_").replace("/", "_")
            company_dir = os.path.join(DOWNLOAD_DIR, safe)
            ensure_dir(company_dir)
            print(f"\nProcessing {name} ({ric})")

            for year in range(START_YEAR, END_YEAR + 1):
                # Best-effort per year: report the error and keep going.
                try:
                    _download_annual_report(ric, year, company_dir, safe)
                except Exception as e:
                    print(f" {year}: error: {e}")
                # Be nice to the API: pause after every attempt, including
                # the ones that found nothing.
                time.sleep(0.75)
    finally:
        # Close the session (guarded in case none is open).
        sess = rd.get_default_session()
        if sess and getattr(sess, "is_open", False):
            rd.close_session()
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()