I want to use Textract (via the AWS CLI) to extract tables from a PDF file (located in an S3 location) and export them into a CSV file. I have tried writing a .py script but am struggling to read from the file.
Any suggestions for writing the .py script are welcome.
This is my current script. I run into this error:

File "extract-table.py", line 63, in get_table_csv_results
    blocks = response['Blocks']
KeyError: 'Blocks'
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint

def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows

def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text

def get_table_csv_results(file_name):
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')

    # Response
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': documentName
            }
        })

    # Get the text blocks
    blocks = response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index + 1)
        csv += '\n\n'
    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)

    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)
    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
    csv += '\n\n\n'
    return csv

def main(file_name):
    table_csv = get_table_csv_results(file_name)
    output_file = 'output.csv'

    # replace content
    with open(output_file, "wt") as fout:
        fout.write(table_csv)

    # show the results
    print('CSV OUTPUT FILE: ', output_file)

# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"

if __name__ == "__main__":
    file_name = sys.argv[1]
    main(file_name)
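The KeyError comes from the fact that start_document_text_detection is asynchronous: its response only contains a JobId, not Blocks, and the blocks have to be fetched afterwards. Tables are also only returned by the analysis APIs (FeatureTypes=['TABLES']), not by text detection. A minimal sketch of that flow with boto3, reusing the bucket and document variables from the script above (the 5-second polling interval is arbitrary):

import time
import boto3

client = boto3.client('textract')

# start_document_analysis is asynchronous: the response only carries a JobId.
job = client.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}},
    FeatureTypes=['TABLES'])

# Poll until the job finishes, then read the blocks (large results are paginated via NextToken).
while True:
    result = client.get_document_analysis(JobId=job['JobId'])
    if result['JobStatus'] in ('SUCCEEDED', 'FAILED'):
        break
    time.sleep(5)  # arbitrary polling interval

blocks = result.get('Blocks', [])  # feed these into blocks_map / table_blocks as in the script above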
There is a much simpler way using the Amazon Textract Textractor library: pip install amazon-textract-textractor
This will create one CSV per table in your PDF document, e.g. output_p0_t0.csv:
from textractor import Textractor

def extract_tables(s3_file_path, output_directory, s3_output_path):
    extractor = Textractor(profile_name="default")
    document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        for i, table in enumerate(document.tables):
            with open(output_directory + f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document

document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
I had to make slight changes to @Thomas's answer by creating the profile (extractor = Textractor(profile_name="default")) right after importing Textractor, as shown below, to avoid getting this error: NameError: name 'textractor' is not defined.
from textractor import Textractor
extractor = Textractor(profile_name="default")

def extract_tables(s3_file_path, output_directory, s3_output_path):
    document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
    for j, page in enumerate(document.pages):
        for i, table in enumerate(document.tables):
            with open(output_directory + f'/output_p{j}_t{i}.csv', 'w') as csv_file:
                csv_file.write(table.to_csv())
    return document

document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
Hope it helps someone out there.
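For what it's worth, the NameError can also be avoided by importing the feature constant directly, since only Textractor itself is imported above. This is a sketch based on the amazon-textract-textractor package layout; the placeholders mirror the answers above:

from textractor import Textractor
from textractor.data.constants import TextractFeatures

extractor = Textractor(profile_name="default")

# TextractFeatures.TABLES can now be passed without spelling out the full module path:
document = extractor.start_document_analysis(
    's3://<INPUT_FILE.PDF>',              # same placeholders as in the answers above
    TextractFeatures.TABLES,
    's3://<TEXTRACT_OUTPUT_DIRECTORY>')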
Number of points: 100,000,000 (4 GB).
I am reading a CSV file and saving the data to separate CSV files. I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of this task? Please suggest alternative options.
Performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")
file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"
    coords = [[19.803499,15.2265],[-35.293499,33.7495],
              [-49.6675,33.726501],[-48.022499,20.4715],
              [-36.336498,-4.925],[-32.6105,-45.494499],
              [-10.5275,-38.3815],[-11.93835,-20.8235],
              [26.939501,-18.095501],[19.803499,15.2265]]
    poly = Polygon(coords)
    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # Considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if str(result) == 'True':
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
                print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
                # print('False', str(x), str(y), str(z))
I used pd.read_csv instead of csv.reader, and the performance improved (processing time went from 1234 sec to 31 sec).
However, I tried to use Python multiprocessing, but I don't understand it well.
import pandas as pd
from shapely.geometry import *

data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499), (16.5735,-26.269501), (0.4755,-28.6635)])
cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
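Before reaching for multiprocessing: if Shapely 2.x is available, the per-point Python loop can be replaced with a vectorized containment test, which is usually the bigger win. A sketch, assuming the same /sample.csv layout as above (x in the first column, y in the second); the output file names are hypothetical:

import pandas as pd
import shapely
from shapely.geometry import Polygon

# Same polygon as in the snippet above; requires Shapely 2.x for the vectorized calls.
poly = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499),
                (16.5735, -26.269501), (0.4755, -28.6635)])

data = pd.read_csv("/sample.csv")
pts = shapely.points(data.iloc[:, 0].to_numpy(), data.iloc[:, 1].to_numpy())
inside = shapely.contains(poly, pts)             # boolean array, one entry per row

data[inside].to_csv("inside.csv", index=False)   # hypothetical output file names
data[~inside].to_csv("outside.csv", index=False)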
For example, here is sample code for Python multiprocessing Pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # Wait 2 seconds
    print(x*x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
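Applied to the point-in-polygon task, a minimal sketch might look like this. It assumes the same /sample.csv layout and polygon as in the pandas snippet above; the chunk count of 8 is arbitrary:

import pandas as pd
from multiprocessing import Pool
from shapely.geometry import Point, Polygon

# Same polygon as in the pandas snippet above.
poly = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499),
                (16.5735, -26.269501), (0.4755, -28.6635)])

def check_chunk(rows):
    # Test one slice of the coordinate list; returns (row, inside) pairs.
    return [(row, poly.intersects(Point(row[0], row[1]))) for row in rows]

if __name__ == '__main__':
    data = pd.read_csv("/sample.csv")
    cord = data.values.tolist()
    n = 8                                    # number of worker processes (arbitrary)
    chunks = [cord[i::n] for i in range(n)]  # split the rows into n slices
    with Pool(n) as p:
        results = p.map(check_chunk, chunks)
    inside = [row for part in results for row, ok in part if ok]
    print(len(inside), 'points inside the polygon')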
I need to download all 8-K filings in the SEC EDGAR database (approximately 3,500 companies). Does anyone know how to do it with software or code?
I tried sec-edgar-downloader (https://pypi.org/project/sec-edgar-downloader) and it is a very good package, but it only lets me download the 8-K filings of a single company at a time.
I also have this code, but I don't do programming, so I don't understand it very well. Does this code do what I asked, and how do I use it?
Thank you in advance.
import pandas as pd
import gc
import glob
import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os, csv, time
from bs4 import BeautifulSoup as bs
import re
import sys

#import edgar  # you only need this and the next line the first time you download the index
#edgar.download_index(path_sec, 2000)  # ... where '2000' is the first year of the period from which you want the data

# This function provides a connection object that is more efficient
def requests_retry_session(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 503, 504),
        session=None,):
    if __name__ == '__main__':
        pass
    import requests
    from requests.adapters import HTTPAdapter
    #from requests.packages.urllib3.util.retry import Retry
    from urllib3.util.retry import Retry
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
def creates_df(tsv_folder, file_type, st_year=2009, lst_year=datetime.datetime.today().year):
    '''This function creates a file with the SEC urls necessary for your work.
    Start date must be in the YYYY format. Default is 2009. Default end_year is today\'s year.
    tsv_folder is the place where your TSV files are, the full path.
    file_type is the SEC file type you want to get, e.g., 8-K or DEFM14A, always between quotes.
    Destination folder for the output CSV file is your current directory.'''
    if __name__ == '__main__':
        pass
    last_year = lst_year
    path_edgar = tsv_folder
    typefile = file_type
    start_year = st_year
    destination = os.getcwd()
    print(f'Saving files to {destination}.')
    list_files = []
    write_cols = True
    for file in glob.glob(path_edgar + '*.tsv'):
        if int(file[-13:-9]) >= int(start_year) and int(file[-13:-9]) <= int(last_year):
            list_files.append(file)
    for file_sec in list_files:
        try:
            print(f'Trying to read {file_sec}.')
            x = pd.read_csv(file_sec, sep='|', dtype=str, names=['cik', 'firm_name', 'file_type', 'report_date', 'file_url_txt', 'file_url_html'])
            print('Done. Processing...')
            x = x[x['file_type'] == typefile]
            for i, j in x.iterrows():
                if len(j[0]) < 10:
                    x.loc[i, 'cik'] = '0' * (10 - len(j[0])) + str(j[0])
            print('Writing...')
            x.to_csv(destination + '/sec_dataframe.csv', header=write_cols, mode='a', index=False)
            write_cols = False
        except Exception as ex:
            print('Can\'t read this file: ' + str(file_sec))
            print('Python returned this message: ' + str(type(ex).__name__), str(ex.args) + '.')
def id_8k(path_to_file, item8k):
    '''This function identifies the 8-K filings that have the respective wanted item.
    It assumes you have a csv file extracted from the function creates_df. You need to
    provide the path to this file as first parameter and the 8-K item as second parameter.
    The function then reads 100,000 rows at a time from the file and processes the results.'''
    if __name__ == '__main__':
        pass
    for chunk in pd.read_csv(path_to_file, chunksize=100000, dtype=str, parse_dates=['report_date']):
        for row, col in chunk.assign(
                keep=[1 if dt.date().year >= 2019 else 0 for dt in chunk.report_date]).query("keep == 1").iterrows():
            try:
                r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_html'])
            except Exception as ex:
                print(str(type(ex).__name__), str(ex.args))
                with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                continue
            soup = bs(r.content, 'lxml')
            print('Got soup object from: ', str(col['file_url_html']), str(col['cik']))
            if soup.text and str(item8k) in soup.text.lower():
                try:
                    r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_txt'])
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
                soup = bs(r.content, 'lxml')
                print('Got your filing item from: ', str(col['file_url_txt']), str(col['cik']))
                try:
                    with open(os.getcwd() + '/' + str(col['cik']) + '_' + str(re.sub(r'[\\/]+', '', str(col['firm_name']))) + '_' +
                              str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html', 'a') as file:
                        file.write(soup.prettify())
                    print('html file is done. Name: ', str(os.getcwd() + '/' + str(col['cik']) + '_' + str(re.sub(r'[\\/]+', '',
                          str(col['firm_name']))) + '_' + str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html'))
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/' + 'errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
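For reference, going by the comments and docstrings in the script, the intended usage appears to be roughly the following. This is only a sketch: the folder path and the item text are placeholders, and the edgar call mirrors the commented-out lines at the top of the script (that edgar module is presumably the python-edgar package):

import os
import edgar  # presumably the python-edgar package; only needed to download the index once

path_sec = '/path/to/tsv_folder/'           # placeholder: where the quarterly index .tsv files go
edgar.download_index(path_sec, 2019)        # first run only, as per the comment in the script

creates_df(path_sec, '8-K', st_year=2019)   # writes sec_dataframe.csv to the current directory
id_8k(os.getcwd() + '/sec_dataframe.csv', 'item 2.02')  # hypothetical item text to search for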
Create a list of your company names (or symbols, or CIKs). If you have the list in Excel, convert it to a CSV and do:
import csv

companies = []
with open('/Path', newline='', encoding='utf-8-sig') as f:
    for row in csv.reader(f):
        companies.append(row[0])
Then, rifle through that list to grab the files:
from sec_edgar_downloader import Downloader

dl = Downloader(Path)
for company in companies:
    dl.get("File Type", company)
You can also use an SEC filings API.
You can retrieve both real-time and historical SEC filings data.
It covers all filing types for all publicly listed companies, mutual funds, and other private placements.
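The answer above doesn't name a specific provider. As a free alternative, the SEC's own submissions endpoint can be queried directly; here is a minimal sketch, assuming you have a list of CIKs (the User-Agent value is a placeholder the SEC requires you to fill in with your own contact details, and the CIK at the end is just an example):

import requests

HEADERS = {"User-Agent": "Your Name your.email@example.com"}  # placeholder: SEC requires a contact User-Agent

def list_8k_urls(cik):
    # All recent filings for one company, via the free EDGAR submissions endpoint.
    cik10 = str(cik).zfill(10)
    data = requests.get(f"https://data.sec.gov/submissions/CIK{cik10}.json",
                        headers=HEADERS).json()
    recent = data["filings"]["recent"]   # older filings are listed in data["filings"]["files"]
    urls = []
    for form, acc, doc in zip(recent["form"], recent["accessionNumber"], recent["primaryDocument"]):
        if form == "8-K":
            urls.append("https://www.sec.gov/Archives/edgar/data/"
                        f"{int(cik10)}/{acc.replace('-', '')}/{doc}")
    return urls

print(list_8k_urls(320193)[:5])  # 320193 is Apple's CIK, used here only as an example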
I am trying to read and find data in PDF files with tika. I have several LibreOffice and PDF files with the same name but different extensions.
First, with this straightforward code:
from tika import parser
import os
from timeit import default_timer as timer

files_to_search = []
times = []
dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
for file_name in dir_list:
    if file_name.find('nterme') > 0 and file_name.find('pdf') > 0:
        files_to_search.append(file_name)

for a in range(20):
    tic = timer()
    path_and_name = ""
    for item in files_to_search:
        path_and_name = r'\\LS-WVLEF8\backup\laskut\secun' + '\\' + item
        try:
            file_data = parser.from_file(path_and_name)
            text = file_data['content']
            text = text.strip()
            if text.find('835528') > 1:
                print('found ' + item)
        except Exception as e:
            print('Exception')
            print(e)
            while 1:
                pass
    tac = timer()
    times.append(tac - tic)
    print('single time ', tac - tic)
    with open('single.txt', 'a') as the_file:
        the_file.write(str(tac - tic) + '\n')

average = sum(times) / 20
max = times.index(max(times))
with open('single.txt', 'a') as the_file:
    the_file.write('average = ' + str(average) + '\n')
    the_file.write('max = ' + str(max) + '\n')
It works slowly. I get average = 1.732.
Then with this version, using multiprocessing:
from tika import tika, parser
from multiprocessing import Pool
import os
from timeit import default_timer as timer

def tika_parser(files_to_search):
    try:
        data = parser.from_file(r'\\LS-WVLEF8\backup\laskut\secun' + '\\' + files_to_search)
        text = data['content']
        text = text.strip()
        if text.find('835528') > 1:
            print('found ' + files_to_search)
    except Exception as e:
        print('Exception')
        print(e)
        while 1:
            pass

if __name__ == '__main__':
    files_to_search = []
    times = []
    dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
    for file_name in dir_list:
        if file_name.find('nterme') > 0 and file_name.find('pdf') > 0:
            files_to_search.append(file_name)

    for a in range(20):
        tic = timer()
        pool = Pool()
        pool.map(tika_parser, files_to_search)
        pool.close()
        tac = timer()
        times.append(tac - tic)
        print('multi time ', tac - tic)
        with open('multi.txt', 'a') as the_file:
            the_file.write(str(tac - tic) + '\n')

    average = sum(times) / 20
    max = times.index(max(times))
    with open('multi.txt', 'a') as the_file:
        the_file.write('average = ' + str(average) + '\n')
        the_file.write('max = ' + str(max) + '\n')
This is a bit faster. I get average = 1.320
Is there a way to do this faster with tika? Or should I look for PyPDF2 or something else?
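One thing that may be worth benchmarking (an assumption, not a measured result): tika-python forwards each file to a single local Tika server over HTTP, so a thread pool, which is cheaper to spin up than a process Pool, can also drive several requests at once. A sketch using the same share path and search string as above:

import os
from concurrent.futures import ThreadPoolExecutor
from tika import parser

SHARE = r'\\LS-WVLEF8\backup\laskut\secun'  # same share as in the question

def search_file(name):
    # Parse one PDF via the local Tika server and look for the search string.
    text = (parser.from_file(os.path.join(SHARE, name)).get('content') or '')
    return name if '835528' in text else None

if __name__ == '__main__':
    names = [n for n in os.listdir(SHARE) if 'nterme' in n and 'pdf' in n]
    with ThreadPoolExecutor(max_workers=8) as ex:   # worker count is arbitrary; benchmark it
        for hit in ex.map(search_file, names):
            if hit:
                print('found', hit)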
I'm translating some Linux log data to a CSV for data analytics. Some of the instructions take some time, so I thought I would put in a progress bar for each file that is being translated. However, when I put in a progress bar with either progressbar2 or tqdm, my pandas dataframes are null. There's no data at all. When I remove the progress bar, everything works as it should.
Here is my CSV translating function:
import pandas as pd
from dateutil import parser
from tqdm import trange
import os
import glob
import csv
import socket

def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        file_length = len(data_file.readlines())
        for i in trange(file_length, desc='loop', leave=False):  # Progress Bar Via TQDM
            for new_line in data_file:
                new_line = line.strip().split(" ")
                date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
                date = parser.parse(date)
                ip = str(new_line[5]).partition("/")
                ip = str(ip[0]).strip()
                try:
                    url = str(new_line[7]).strip()
                except:
                    url = None
                csv_w.writerow([date, ip, url])
tqdm is breaking something, or I am implementing it incorrectly.
EDIT 1:
I figured it out. I was exhausting the file handle with readlines() when getting its length. This works:
def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    path, dirs, files = next(os.walk(LOGS_FILE_PATH))
    log_num = len(files)
    print(log_num)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        with open(filename, "r") as f:
            file_length = len(f.readlines())
            f.close()
        pbar = tqdm(total=file_length)
        for line in data_file:
            new_line = line.strip().split(" ")
            date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
            date = parser.parse(date)
            ip = str(new_line[5]).partition("/")
            ip = str(ip[0]).strip()
            try:
                url = str(new_line[7]).strip()
            except:
                url = None
            csv_w.writerow([date, ip, url])
            pbar.update(1)
        pbar.close()
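A slightly leaner variant of the same idea is to count the lines and then rewind the same handle with seek(0) instead of opening the file twice. A sketch, not tested against these logs:

from tqdm import tqdm

def iter_lines_with_progress(filename):
    # Count lines for the progress total, then rewind the same handle and yield lines.
    with open(filename, "r") as data_file:
        total = sum(1 for _ in data_file)
        data_file.seek(0)
        for line in tqdm(data_file, total=total):
            yield line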
You can apply tqdm to your main loop:
from tqdm import tqdm
for i in tqdm(condition):