Skip to content
Snippets Groups Projects
Unverified Commit 98bd3345, authored by Sourabh Desai, committed by GitHub
Browse files

Sourabh/bugfix/sec downloader update (#57)


* Updated sec-edgar-downloader to version 5.0.2

* pull SEC EDGAR user agent details from .env

---------

Co-authored-by: dr-BEat <dr-BEat@users.noreply.github.com>
parent 887c3db9
No related branches found
No related tags found
No related merge requests found
......@@ -9,3 +9,5 @@ CDN_BASE_URL=http://llama-app-web-assets-local.s3-website.localhost.localstack.c
AWS_KEY=xxx
AWS_SECRET=xxx
POLYGON_IO_API_KEY=xxx
SEC_EDGAR_COMPANY_NAME=YourOrgName
SEC_EDGAR_EMAIL=you@example.com
......@@ -20,6 +20,7 @@ Live at https://secinsights.ai/
1. Lastly, you will likely want to populate your local database with some sample SEC filings
- We have a script for this! But first, open your `.env` file and replace the placeholder value for the `OPENAI_API_KEY` with your own OpenAI API key
- At some point you will want to do the same for the other secret keys in here like `POLYGON_IO_API_KEY`, `AWS_KEY`, & `AWS_SECRET`
- To follow the [SEC's Internet Security Policy](https://www.sec.gov/os/webmaster-faq#code-support), make sure to also replace the `SEC_EDGAR_COMPANY_NAME` & `SEC_EDGAR_EMAIL` values in the `.env` file with your own values.
- Source the file again with `set -a` then `source .env`
- Run `make seed_db_local`
- If this step fails, you may find it helpful to run `make refresh_db` to wipe your local database and re-start with emptied tables.
......
import os
from enum import Enum
from typing import List, Union, Optional
from pydantic import BaseSettings, AnyHttpUrl, validator
from pydantic import BaseSettings, AnyHttpUrl, EmailStr, validator
from multiprocessing import cpu_count
......@@ -79,6 +79,8 @@ class Settings(PreviewPrefixedSettings):
SENTRY_DSN: Optional[str]
RENDER_GIT_COMMIT: Optional[str]
LOADER_IO_VERIFICATION_STR: str = "loaderio-e51043c635e0f4656473d3570ae5d9ec"
SEC_EDGAR_COMPANY_NAME: str = "YourOrgName"
SEC_EDGAR_EMAIL: EmailStr = "you@example.com"
# BACKEND_CORS_ORIGINS is a JSON-formatted list of origins
# e.g: '["http://localhost", "http://localhost:4200", "http://localhost:3000", \
......
This diff is collapsed.
......@@ -31,6 +31,7 @@ polygon-api-client = "^1.12.3"
nltk = "^3.8.1"
cachetools = "^5.3.1"
greenlet = "^2.0.2"
email-validator = "^2.0.0.post2"
[tool.poetry.group.dev.dependencies]
......@@ -39,7 +40,7 @@ pytest = "^7.3.2"
sseclient-py = "^1.7.2"
pdfkit = "^1.0.0"
fire = "^0.5.0"
sec-edgar-downloader = "^4.3.0"
sec-edgar-downloader = "~5.0"
pytickersymbols = "^1.13.0"
awscli-local = "^0.20"
......
......@@ -7,6 +7,7 @@ from fire import Fire
from sec_edgar_downloader import Downloader
from distutils.spawn import find_executable
from tqdm.contrib.itertools import product
from app.core.config import settings
DEFAULT_OUTPUT_DIR = "data/"
# You can lookup the CIK for a company here: https://www.sec.gov/edgar/searchedgar/companysearch
......@@ -47,10 +48,10 @@ DEFAULT_FILING_TYPES = [
def _download_filing(
cik: str, filing_type: str, output_dir: str, amount=None, before=None, after=None
cik: str, filing_type: str, output_dir: str, limit=None, before=None, after=None
):
dl = Downloader(output_dir)
dl.get(filing_type, cik, amount=amount, before=before, after=after)
dl = Downloader(settings.SEC_EDGAR_COMPANY_NAME, settings.SEC_EDGAR_EMAIL, output_dir)
dl.get(filing_type, cik, limit=limit, before=before, after=after, download_details=True)
def _convert_to_pdf(output_dir: str):
......@@ -62,15 +63,16 @@ def _convert_to_pdf(output_dir: str):
# │ ├── AAPL
# │ │ ├── 10-K
# │ │ │ ├── 0000320193-20-000096
# │ │ │ │ ├── filing-details.html
# │ │ │ │ ├── filing-details.pdf <-- this is what we want
# │ │ │ │ ├── primary-document.html
# │ │ │ │ ├── primary-document.pdf <-- this is what we want
data_dir = Path(output_dir) / "sec-edgar-filings"
for cik_dir in data_dir.iterdir():
for filing_type_dir in cik_dir.iterdir():
for filing_dir in filing_type_dir.iterdir():
filing_doc = filing_dir / "filing-details.html"
filing_pdf = filing_dir / "filing-details.pdf"
filing_doc = filing_dir / "primary-document.html"
filing_pdf = filing_dir / "primary-document.pdf"
if filing_doc.exists() and not filing_pdf.exists():
print("- Converting {}".format(filing_doc))
input_path = str(filing_doc.absolute())
......@@ -87,7 +89,7 @@ def main(
file_types: List[str] = DEFAULT_FILING_TYPES,
before: Optional[str] = None,
after: Optional[str] = None,
amount: Optional[int] = 3,
limit: Optional[int] = 3,
convert_to_pdf: bool = True,
):
print('Downloading filings to "{}"'.format(Path(output_dir).absolute()))
......@@ -105,7 +107,7 @@ def main(
print(f"- Filing for {symbol} {file_type} already exists, skipping")
else:
print(f"- Downloading filing for {symbol} {file_type}")
_download_filing(symbol, file_type, output_dir, amount, before, after)
_download_filing(symbol, file_type, output_dir, limit, before, after)
except Exception as e:
print(
f"Error downloading filing for symbol={symbol} & file_type={file_type}: {e}"
......
......@@ -120,7 +120,7 @@ def get_available_filings(output_dir: str) -> List[Filing]:
for cik_dir in data_dir.iterdir():
for filing_type_dir in cik_dir.iterdir():
for filing_dir in filing_type_dir.iterdir():
filing_pdf = filing_dir / "filing-details.pdf"
filing_pdf = filing_dir / "primary-document.pdf"
full_submission_txt = filing_dir / "full-submission.txt"
if filing_pdf.exists():
filing_type = filing_type_dir.name
......
......@@ -23,7 +23,7 @@ DEFAULT_DOC_DIR = "data/"
async def upsert_document(doc_dir: str, stock: Stock, filing: Filing, url_base: str):
# construct a string for just the document's sub-path after the doc_dir
# e.g. "sec-edgar-filings/AAPL/10-K/0000320193-20-000096/filing-details.pdf"
# e.g. "sec-edgar-filings/AAPL/10-K/0000320193-20-000096/primary-document.pdf"
doc_path = Path(filing.file_path).relative_to(doc_dir)
url_path = url_base.rstrip("/") + "/" + str(doc_path).lstrip("/")
doc_type = (
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment