SEC EDGAR Database Downloader - python

I need to download all 8-K filings from the SEC EDGAR database (approximately 3,500 companies). Does anyone know how to do it with existing software or code?
I tried sec-edgar-downloader (https://pypi.org/project/sec-edgar-downloader) and it is a very good package, but it only lets me download a single company's 8-K filings at a time.
I also have the code below, but I don't program, so I don't understand it very well. Does this code do what I described, and how do I use it?
Thank you in advance.
import pandas as pd
import gc
import glob
import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os, csv, time
from bs4 import BeautifulSoup as bs
import re
import sys

# import edgar  # you only need this and the next line the first time you download the index
# edgar.download_index(path_sec, 2000)  # ... where '2000' is the first year of the period from which you want the data

# This function provides a connection object that is more efficient (retries on transient errors)
def requests_retry_session(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 503, 504),
        session=None):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def creates_df(tsv_folder, file_type, st_year=2009, lst_year=datetime.datetime.today().year):
    '''This function creates a file with the SEC urls necessary for your work.
    Start year must be in the YYYY format. Default is 2009. Default end year is today's year.
    tsv_folder is the full path to the folder where your TSV index files are.
    file_type is the SEC file type you want to get, e.g., 8-K or DEFM14A, always between quotes.
    Destination folder for the output CSV file is your current directory.'''
    last_year = lst_year
    path_edgar = tsv_folder
    typefile = file_type
    start_year = st_year
    destination = os.getcwd()
    print(f'Saving files to {destination}.')
    list_files = []
    write_cols = True
    # keep only index files (named like YYYY-QTR#.tsv) within the requested year range
    for file in glob.glob(path_edgar + '*.tsv'):
        if int(file[-13:-9]) >= int(start_year) and int(file[-13:-9]) <= int(last_year):
            list_files.append(file)
    for file_sec in list_files:
        try:
            print(f'Trying to read {file_sec}.')
            x = pd.read_csv(file_sec, sep='|', dtype=str,
                            names=['cik', 'firm_name', 'file_type', 'report_date',
                                   'file_url_txt', 'file_url_html'])
            print('Done. Processing...')
            x = x[x['file_type'] == typefile]
            # zero-pad CIKs to 10 digits
            for i, j in x.iterrows():
                if len(j['cik']) < 10:
                    x.loc[i, 'cik'] = '0' * (10 - len(j['cik'])) + str(j['cik'])
            print('Writing...')
            x.to_csv(destination + '/sec_dataframe.csv', header=write_cols, mode='a', index=False)
            write_cols = False
        except Exception as ex:
            print('Can\'t read this file: ' + str(file_sec))
            print('Python returned this message: ' + str(type(ex).__name__), str(ex.args) + '.')


def id_8k(path_to_file, item8k):
    '''This function identifies the 8-K filings that contain the wanted item.
    It assumes you have a csv file extracted from the function creates_df. You need to
    provide the path to this file as first parameter and the 8-K item as second parameter.
    The function then reads 100,000 rows at a time from the file and processes the results.'''
    for chunk in pd.read_csv(path_to_file, chunksize=100000, dtype=str, parse_dates=['report_date']):
        # keep only filings from 2019 onwards
        for row, col in chunk.assign(
                keep=[1 if dt.date().year >= 2019 else 0 for dt in chunk.report_date]).query("keep == 1").iterrows():
            try:
                r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_html'])
            except Exception as ex:
                print(str(type(ex).__name__), str(ex.args))
                with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                continue
            soup = bs(r.content, 'lxml')
            print('Got soup object from: ', str(col['file_url_html']), str(col['cik']))
            if soup.text and str(item8k) in soup.text.lower():
                try:
                    r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_txt'])
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
                soup = bs(r.content, 'lxml')
                print('Got your filing item from: ', str(col['file_url_txt']), str(col['cik']))
                try:
                    out_name = (os.getcwd() + '/' + str(col['cik']) + '_' +
                                str(re.sub(r'[\\/]+', '', str(col['firm_name']))) + '_' +
                                str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html')
                    with open(out_name, 'a') as file:
                        file.write(soup.prettify())
                    print('html file is done. Name: ', out_name)
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
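
For reference, once the EDGAR index TSVs have been downloaded (the commented-out edgar lines at the top of the script), the two functions above would be called roughly like this; the folder path and the item string are placeholders you would replace with your own:

# Hypothetical usage, assuming the index TSVs from the edgar package live in /path/to/tsv/
creates_df('/path/to/tsv/', '8-K', st_year=2019)            # builds sec_dataframe.csv in the current folder
id_8k(os.getcwd() + '/sec_dataframe.csv', 'item 5.02')      # item string is matched against lowercased filing text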

Create a list of your company names (or ticker symbols, or CIKs). If you have the list in Excel, convert it to a CSV and do:
import csv
companies = []
with open('/Path', newline='', encoding='utf-8-sig') as f:
    for row in csv.reader(f):
        companies.append(row[0])
Then loop through that list to grab the filings:
from sec_edgar_downloader import Downloader
dl = Downloader(Path)
for company in companies:
    dl.get("File Type", company)
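
Putting the pieces together, a minimal sketch of that loop with sec-edgar-downloader; the CSV file name and the constructor arguments are assumptions (newer releases of the package ask for a company name and e-mail address for the SEC-required User-Agent, while older ones take a download folder, so check the version you have installed):

import csv
from sec_edgar_downloader import Downloader

# Hypothetical identifiers for the User-Agent header required by newer releases.
dl = Downloader("MyCompanyName", "my.email@example.com")

companies = []
with open('companies.csv', newline='', encoding='utf-8-sig') as f:  # hypothetical file name
    for row in csv.reader(f):
        companies.append(row[0])

for company in companies:
    try:
        dl.get("8-K", company)   # downloads every 8-K for this ticker/CIK
    except Exception as ex:
        print(f"Could not download {company}: {ex}")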

You can also use the SEC Filings API.
It lets you retrieve both real-time and historical SEC filings data, and it covers all filing types for publicly listed companies, mutual funds and other private placements.

Related

getting weird results from metapub and pubmed

I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path

another_path = '/content/Articles/'
pmid_list = ['35566889', '33538053', '30848212']

for i in range(len(pmid_list)):
    query = pmid_list[i]
    #for ind in pmid_df.index:
    #    query = pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8"))
    except:
        continue
I see two mistakes.
First: urlretrieve(url) saves the data in a temporary file with a random name, so you can't access it because you don't know that name. You should use the second parameter to save it under your own filename:
urlretrieve(url, file_name)
Second: you use the same out_file both to process the file (process(out_file)) and to write the result (open(out_file, 'w')). But open(..., 'w') runs first and truncates the file, so you end up processing an empty file. You should process the file first and only then open it for writing:
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")
with open(out_file, "wb") as textfile:  # save bytes
    textfile.write(data)
Or write the result under a different name (e.g., with a .txt extension).
Full working example with other small changes:
import os
from urllib.request import urlretrieve
import metapub
import textract

#another_path = '/content/Articles/'
another_path = './'
pmid_list = ['35566889', '33538053', '30848212']

for query in pmid_list:
    print('query:', query)
    url = metapub.FindIt(query).url
    print('url:', url)
    if url:
        try:
            out_file = os.path.join(another_path, query)
            print('out_file:', out_file)
            print('... downloading')
            urlretrieve(url, out_file + '.pdf')
            print('... processing')
            data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")
            print('... saving')
            with open(out_file + '.txt', "wb") as textfile:  # save bytes
                textfile.write(data)
            print('... OK')
        except Exception as ex:
            print('Exception:', ex)

Extract only specific text from PDF using Python

I need to extract specific text from invoice PDF files that each have a different structure, using Python, and store the output in particular Excel columns. The PDFs all have different layouts but contain the same kinds of values.
I tried to solve it but was not able to extract only the specific text values.
Sample PDF file:
Click to view the sample file
I need to extract Invoice ID, Issue Date, Subject and Amount Due from the whole PDF file.
Script I have used so far:
import PyPDF2
import re
pdfFileObj = open('test.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
text = str(pageObj.extractText())
quotes = re.findall(r'"[^"]*"',text)
print(quotes)
You have a very nice PDF document, because your PDF has form fields, so you can use them directly to read the data:
import PyPDF2
pdfFileObj = open('test.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields = pdfReader.getFormTextFields()
print(fields["Invoice ID"])
print(fields["Issue Date"])
print(fields["Subject"])
print(fields["Amount Due"])
EDIT:
I combined your requested data (from here: How to extract only specific text from PDF file using python) into a little script with three ways of parsing the PDF, one per sample PDF. The problem is that your PDFs differ a lot and each package has its advantages on different PDFs, so I think you have to combine these approaches: try each function in turn until one returns a result. I hope this is a good start for you. You may have to adjust the regexes if you have more kinds of PDFs, and you may want to store the regexes in one list per field (four lists in total) and pass them to the two regex-based parsing functions, so that you keep three parsing functions but only one set of patterns per field (see the sketch after the script below).
import PyPDF2
import re
import os
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def parse_pdf_by_regex_2(filename: str) -> dict:
    # Extract the raw text with pdfminer, then apply the regexes.
    output_string = StringIO()
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    regex_invoice_no = re.compile(r"Invoice No.:\s*(\w+)\s")
    regex_order_no = re.compile(r"IRN:\s*(\d+)")
    regex_due_date = re.compile(r"Due Date: (\d{2}\.\d{2}\.\d{4})")
    regex_total_due = re.compile(r"([\d,.]+) \n\nTotal Invoice Value\(in words\)")
    try:
        return {"invoice_id": re.search(regex_invoice_no, output_string.getvalue()).group(1),
                "issue_date": re.search(regex_due_date, output_string.getvalue()).group(1),
                "subject": re.search(regex_order_no, output_string.getvalue()).group(1),
                "amount": re.search(regex_total_due, output_string.getvalue()).group(1)}
    except AttributeError:
        print("Not all elements have been found")
        return {}


def parse_pdf_by_form_fields(filename: str) -> dict:
    # Works only for PDFs that actually contain form fields.
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        try:
            fields = pdf_reader.getFormTextFields()
        except TypeError:
            # print("No form fields available")
            return {}
    try:
        # You could also check whether only some values are missing; that depends on your data.
        return {"invoice_id": fields["Invoice ID"],
                "issue_date": fields["Issue Date"],
                "subject": fields["Subject"],
                "amount": fields["Amount Due"]}
    except KeyError as err:
        # print(f"Key not found: '{err.args[0]}'")
        return {}


def parse_pdf_by_regex(filename: str) -> dict:
    # Extract the raw text with PyPDF2, then apply the regexes.
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        text_data = ""
        for page_no in range(pdf_reader.getNumPages()):
            text_data += pdf_reader.getPage(page_no).extractText()
    regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
    regex_order_no = re.compile(r"Order Number(\d+)")
    regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
    regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")
    try:
        return {"invoice_id": re.search(regex_invoice_no, text_data).group(1),
                "issue_date": re.search(regex_due_date, text_data).group(1),
                "subject": re.search(regex_order_no, text_data).group(1),
                "amount": re.search(regex_total_due, text_data).group(1)}
    except AttributeError:
        # print("Not all elements have been found")
        return {}


def parse_pdf(filename: str) -> dict:
    # Hint: ':=' is available since Python 3.8
    if data := parse_pdf_by_form_fields(filename=filename):
        return data
    elif data := parse_pdf_by_regex(filename=filename):
        return data
    elif data := parse_pdf_by_regex_2(filename=filename):
        return data
    else:
        print("No data found")
        return {}


if __name__ == '__main__':
    for fname in os.listdir("."):
        if fname.startswith("testfile"):
            print(f"check {fname}")
            print(parse_pdf(filename=fname))
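
As a rough illustration of the "one list of regexes per field" idea mentioned in the EDIT above, here is a sketch with the patterns collected into a table; the patterns are just copied from the two regex functions and would still need tuning for other PDFs:

import re

# Hypothetical table: for each output field, a list of alternative patterns tried in order.
FIELD_PATTERNS = {
    "invoice_id": [r"Invoice Number\s*(INV-\d+)", r"Invoice No\.:\s*(\w+)\s"],
    "issue_date": [r"Due Date(\S+ \d{1,2}, \d{4})", r"Due Date: (\d{2}\.\d{2}\.\d{4})"],
    "subject":    [r"Order Number(\d+)", r"IRN:\s*(\d+)"],
    "amount":     [r"Total Due(\$\d+\.\d{1,2})", r"([\d,.]+) \n\nTotal Invoice Value\(in words\)"],
}

def parse_text_by_patterns(text_data: str) -> dict:
    """Apply every pattern list to already-extracted text; return {} if any field is missing."""
    result = {}
    for field, patterns in FIELD_PATTERNS.items():
        for pattern in patterns:
            match = re.search(pattern, text_data)
            if match:
                result[field] = match.group(1)
                break
        else:
            return {}  # no pattern matched this field
    return result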

How can I add multiple rows in a CSV, but overwrite every time the script is run?

So I am running a script which pulls data for 60 different items; for easy use I want to store this data in a CSV file. Every time I run the script I want to store all 60 items and overwrite all previous data. When I run it at the moment using the 'w' argument it only keeps the very last item it pulls. If I change it to 'a' it adds all 60 items, but it does not overwrite them the next time I run the script. Any help is appreciated!
import os, random, csv

vin = '1600'
n = 1
for path, subdirs, files in os.walk(r'\\sorion-app01\Shares\ProcessData\Broadcasts\Good'):
    for filename in files:
        if not 'XX' or not '0X' or not 'XXXXXX' or not '000X' in filename:
            f = os.path.join(path, filename)
            vins = str(f)[50:67]
            if not 'X' in vins[13:17]:
                vin = int(vins[13:17])
                if vin > 1600:
                    from sys import platform
                    import pyodbc
                    if platform == "linux" or platform == "linux2":
                        print("linux")
                        driver = 'mydriver.so'
                        conn = pyodbc.connect(
                            driver = driver,
                            TDS_Version = '7.2',  # Use for
                            server = 'aserver',
                            port = 1433,
                            database = 'TraceDB',
                            uid = 'EXTUser',
                            pwd = 'EXTPass!')
                    elif platform == "win32":
                        #print("Win32")
                        conn = pyodbc.connect('DRIVER={SQL Server};SERVER=database')
                    #pull the data here#
                    with open('file.csv', 'w', newline='') as csvfile:
                        filewriter = csv.writer(csvfile, delimiter=',',
                                                quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
                        header = [('column1'), ('column1'), ('column2'), ('column3'), ('column4'), ('column5')]
                        if not csvfile:
                            filewriter.writeheader(header)
                        filewriter.writerow([data, data2, data3, data4, data5, data6])
                        n = n + 1
It's because you're re-opening the file every time round the loop; you want to open it before the loop and close it after the loop.
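A minimal sketch of that structure; the pull_data_for helper and the column names are placeholders standing in for the database/os.walk logic in your script:

import csv

def pull_data_for(item):
    # hypothetical stand-in for the pyodbc query in the question
    return [item, 'data2', 'data3', 'data4', 'data5', 'data6']

# Open the file once in 'w' mode: this truncates last run's contents,
# and every row written inside the loop is kept.
with open('file.csv', 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['column1', 'column2', 'column3', 'column4', 'column5', 'column6'])
    for item in range(60):          # stands in for your os.walk / database loop
        filewriter.writerow(pull_data_for(item))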
Would overwriting it as an empty file at the very beginning of your code work for you?
import os, random, csv

with open("file.csv", "w") as my_empty_csv:
    pass

vin = '1600'
n = 1
[...]
Best!

Downloading image from url and assign it with a specified id from csv file

I have a CSV file with the columns image_id and image_url.
I need to download all the images from the URLs and save each one with its corresponding image_id as the filename. Is there a way to do so?
I'm aware you can do this with Python; from code I've seen online, something like:
import cStringIO  # *much* faster than StringIO
import urllib
import Image

try:
    file = urllib.urlopen('http://freegee.sourceforge.net/FG_EN/src/teasers_en/t_gee-power_en.gif')
    im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
    img = Image.open(im)
    img.save('/home/wenbert/uploaderx_files/test.gif')
except IOError, e:
    raise e
But how do you reference the URL and the filename from the CSV?
Even better if I can automate the process of uploading to a GCP bucket.
Appreciate any help I can get.
Cheers!
This should help. Use the csv module to parse through your CSV file.
Ex:
# -*- coding: utf-8 -*-
import csv
import cStringIO  # *much* faster than StringIO
import urllib
import Image

def downloadFile(imageID, url):
    try:
        file = urllib.urlopen(url)
        im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
        img = Image.open(im)
        img.save('/home/wenbert/uploaderx_files/{0}.gif'.format(imageID))
    except IOError, e:
        raise e

with open('PATH_TO_.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # skip the headers
    for row in reader:
        print row
        downloadFile(row[0], row[1])
I have produced a Python script below. I have only tested it on Python 3.4.3, but it should do the trick.
Hope this helps.
import csv, os, requests
import urllib.request

spreadsheetAddress = 'C:\\SOURCE\\CSV\\FILE.csv'
targetDirectory = 'C:\\TARGET\\IMAGE\\SAVE\\LOCATION\\'

def getSpreadsheetContents(spreadsheetAddress):
    with open(spreadsheetAddress) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        imageSet = {}
        for row in readCSV:
            if 'image_id' not in row:  # skip the header row
                imageSet[row[0]] = row[1]
        return imageSet

if __name__ == "__main__":
    if os.path.exists(spreadsheetAddress) and os.path.exists(targetDirectory):
        imageDict = getSpreadsheetContents(spreadsheetAddress)
        for key, value in imageDict.items():
            if requests.get(value).status_code == 200:
                filename, file_extension = os.path.splitext(value)
                address = os.path.join(targetDirectory, key + file_extension)
                urllib.request.urlretrieve(value, address)
    else:
        raise Exception("File not found")
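
Neither snippet covers the "upload to a GCP bucket" part of the question. A minimal sketch with the google-cloud-storage client, assuming the bucket already exists and credentials are configured; the bucket name is a placeholder:

from google.cloud import storage

def upload_to_gcs(bucket_name, local_path, blob_name):
    """Upload one local file to gs://bucket_name/blob_name."""
    client = storage.Client()            # picks up GOOGLE_APPLICATION_CREDENTIALS
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(local_path)

# Example: push each downloaded image after saving it locally.
# upload_to_gcs("my-image-bucket", address, key + file_extension)  # hypothetical bucket name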

NVD - JSON to CSV with Python

I am trying to download the NVD CVE feeds. Here is my Python code:
import requests
import re

r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')
for filename in re.findall(r"nvdcve-1.0-[0-9]*\.json\.zip", r.text):
    print(filename)
    r_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
    with open("nvd/" + filename, 'wb') as f:
        for chunk in r_file:
            f.write(chunk)
Now I want to write all the JSON files into a CSV file with this format:
Name, Value, Description, ..., ...
Name, Value, Description, ..., ...
Can somebody help me?
The following should get you started, giving you the columns ID, URL, VendorName, Description and VersionValues:
import requests
import re
import zipfile
import io
import json
import csv

r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['ID', 'URL', 'VendorName', 'Description', 'VersionValues'])
    for filename in re.findall(r"nvdcve-1.0-[0-9]*\.json\.zip", r.text):
        print("Downloading {}".format(filename))
        r_zip_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
        zip_file_bytes = io.BytesIO()
        for chunk in r_zip_file:
            zip_file_bytes.write(chunk)
        zip_file = zipfile.ZipFile(zip_file_bytes)
        for json_filename in zip_file.namelist():
            print("Extracting {}".format(json_filename))
            json_raw = zip_file.read(json_filename).decode('utf-8')
            json_data = json.loads(json_raw)
            for entry in json_data['CVE_Items']:
                try:
                    vendor_name = entry['cve']['affects']['vendor']['vendor_data'][0]['vendor_name']
                except IndexError:
                    vendor_name = "unknown"
                try:
                    url = entry['cve']['references']['reference_data'][0]['url']
                except IndexError:
                    url = ''
                try:
                    vv = []
                    for pd in entry['cve']['affects']['vendor']['vendor_data'][0]['product']['product_data']:
                        for vd in pd['version']['version_data']:
                            vv.append(vd['version_value'])
                    version_values = '/'.join(vv)
                except IndexError:
                    version_values = ''
                csv_output.writerow([
                    entry['cve']['CVE_data_meta']['ID'],
                    url,
                    vendor_name,
                    entry['cve']['description']['description_data'][0]['value'],
                    version_values])
This downloads each zip file into memory, extracts its files one at a time, and converts the JSON into a Python data structure using json.loads(). For each entry in CVE_Items it then extracts a few of the fields and writes them to a CSV file.
As the JSON data is highly structured, you will need to consider how you want to represent all of the fields in a CSV file. Currently it extracts only a few "useful" fields and stores those.
Alternatively, instead of building the CSV yourself, you could work with pandas:
df = pd.read_json(json_raw)
df.to_csv(f_output)
and remove the csv_output lines. This would still need some extra work to decide how the output should be formatted.
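If you go the pandas route, pandas.json_normalize may be a better fit than read_json for this nested feed; a minimal sketch, assuming json_data is the dict produced by json.loads above:

import pandas as pd

# Flatten the list of CVE entries; nested keys become dotted column names,
# e.g. 'cve.CVE_data_meta.ID'.
df = pd.json_normalize(json_data['CVE_Items'])
print(df.columns)                              # inspect the flattened column names first
df.to_csv('output_pandas.csv', index=False)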
