Reading PDF files faster with Tika (Python 3)

I am trying to read and find data in PDF files with Tika. I have several LibreOffice and PDF files with the same names but different extensions.
First I tried this straightforward code:
from tika import parser
import os
from timeit import default_timer as timer

files_to_search = []
times = []
dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
for file_name in dir_list:
    if file_name.find('nterme')>0 and file_name.find('pdf')>0:
        files_to_search.append(file_name)

for a in range(20):
    tic = timer()
    path_and_name = ""
    for item in files_to_search:
        path_and_name = r'\\LS-WVLEF8\backup\laskut\secun'+'\\'+item
        try:
            file_data = parser.from_file(path_and_name)
            text = file_data['content']
            text = text.strip()
            if text.find('835528')>1:
                print('found '+item)
        except Exception as e:
            print('Exception')
            print(e)
            while 1:
                pass
    tac = timer()
    times.append(tac-tic)
    print('single time ', tac-tic)
    with open('single.txt', 'a') as the_file:
        the_file.write(str(tac-tic)+'\n')

average = sum(times)/20
max = times.index(max(times))
with open('single.txt', 'a') as the_file:
    the_file.write('average = '+str(average)+'\n')
    the_file.write('max = '+str(max)+'\n')
It works, but slowly. I get average = 1.732 seconds.
Then I tried this version with multiprocessing:
from tika import tika, parser
from multiprocessing import Pool
import os
from timeit import default_timer as timer

def tika_parser(files_to_search):
    try:
        data = parser.from_file(r'\\LS-WVLEF8\backup\laskut\secun\\'+files_to_search)
        text = data['content']
        text = text.strip()
        if text.find('835528')>1:
            print('found ' + files_to_search)
    except Exception as e:
        print('Exception')
        print(e)
        while 1:
            pass

if __name__ == '__main__':
    files_to_search = []
    times = []
    dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
    for file_name in dir_list:
        if file_name.find('nterme')>0 and file_name.find('pdf')>0:
            files_to_search.append(file_name)

    for a in range(20):
        tic = timer()
        pool = Pool()
        pool.map(tika_parser, files_to_search)
        pool.close()
        tac = timer()
        times.append(tac-tic)
        print('multi time ', tac-tic)
        with open('multi.txt', 'a') as the_file:
            the_file.write(str(tac-tic)+'\n')

    average = sum(times)/20
    max = times.index(max(times))
    with open('multi.txt', 'a') as the_file:
        the_file.write('average = '+str(average)+'\n')
        the_file.write('max = '+str(max)+'\n')
This is a bit faster. I get average = 1.320 seconds.
Is there a way to do this faster with tika? Or should I look for PyPDF2 or something else?
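One idea worth trying, offered as a sketch rather than a tested fix: each parser.from_file call mostly waits on the local Tika server over HTTP, so the work is I/O-bound, and a thread pool can overlap that waiting without paying process start-up cost on every run. Something along these lines, reusing the directory and search string from the code above (the worker count and helper name are illustrative):

from concurrent.futures import ThreadPoolExecutor
from tika import parser
import os

SEARCH_DIR = r'\\LS-WVLEF8\backup\laskut\secun'
NEEDLE = '835528'

def search_file(file_name):
    # Each call is mostly an HTTP round-trip to the Tika server,
    # so threads spend their time waiting rather than competing for the CPU.
    data = parser.from_file(os.path.join(SEARCH_DIR, file_name))
    text = (data.get('content') or '').strip()
    if NEEDLE in text:
        print('found ' + file_name)

if __name__ == '__main__':
    files = [f for f in os.listdir(SEARCH_DIR)
             if 'nterme' in f and f.endswith('.pdf')]
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(search_file, files)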

Related

How to set the path where the files will be created?

I made a program that creates a new text file every second, but how can I set the path where those text files will be created (for example, the Desktop)?
Here is my code:
from time import sleep
import string
import random

def getR():
    p = ""
    for _ in range(0, 10):
        p += random.choice(string.ascii_letters)
    return p

getR()
n = input("Are you sure you want to start this program?. \n")
if n == "yes":
    if __name__ == "__main__":
        while True:
            f = open("{}.txt".format(getR()), "w+")
            f.close()
            sleep(1)
else:
    print("Closing the file...")
This does what you ask, although why you want to create an infinite number of randomly named empty files on your desktop is beyond me.
import os
from time import sleep
import string
import random

def getR():
    return ''.join(random.choice(string.ascii_letters) for _ in range(10))

if __name__ == "__main__":
    desktop = os.environ['USERPROFILE'] + "/Desktop/"
    while True:
        f = open(desktop + "{}.txt".format(getR()), "w+")
        f.close()
        sleep(1)
To change the working directory use os.chdir:
import os
os.chdir(PATH)
Or you can specify the path directly on open:
file = os.path.join(PATH, FILENAME)
f = open(file, 'w+')
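Putting the two together, a small sketch of writing a file straight to the Desktop (assuming Windows and the USERPROFILE variable used in the answer above; the file name is just an example):

import os

desktop = os.path.join(os.environ['USERPROFILE'], 'Desktop')
file_path = os.path.join(desktop, 'example.txt')  # build the full path first
with open(file_path, 'w+') as f:
    f.write('hello')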

Multithreading and saving to a CSV file

I am trying to make requests with multithreading and save the results to a file,
but while the threads are running the rows are not saved to the CSV correctly.
What can I do?
Do I have to add time.sleep? It seems it cannot do the requests
and the saving at the same time. I would be happy for any help, thanks.
Here is the script:
import requests
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time
import json

url_list = []
with open('bookslinks.csv', newline='') as f:
    reader = csv.reader(f)
    # urls = list(reader)
    for row in reader:
        url_list.append(row[0])

def download_file(url):
    string = url
    say = "="
    after = string[string.index(say) + len(say):]
    bugeycheck = "https://xxxx/" + after + "data=" + after
    j = json.loads(requests.get(bugeycheck, timeout=20, stream=True).content)
    dataname = "bbbb/" + str(after) + "bbbb" + ".txt"
    print(j["xx"])
    with open('beneficiary.csv', 'a') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow([after, j["xx"]])
    return

start = time()
processes = []
with ThreadPoolExecutor(max_workers=100) as executor:
    for url in url_list:
        processes.append(executor.submit(download_file, url))

for task in as_completed(processes):
    print("test")

print(f'Time taken: {time() - start}')
Here is the Python console output when multithreading is in use:
#####
false
false
falsefalse
false
####
false = added to the CSV file
false = added to the CSV file
falsefalse = only one of the two rows was added to the CSV file
false = added to the CSV file
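The falsefalse lines suggest that two threads are appending to beneficiary.csv at the same time, so their rows get interleaved or lost. A common fix is to protect the shared file with a lock so that only one thread writes at a time. A minimal sketch, assuming the same column layout as above (the lock name and helper function are illustrative):

import csv
import threading

csv_lock = threading.Lock()  # shared by all worker threads

def save_row(after, value):
    # Only one thread at a time may append to the shared CSV file.
    with csv_lock:
        with open('beneficiary.csv', 'a', newline='') as new_file:
            csv.writer(new_file).writerow([after, value])

download_file would then call save_row(after, j["xx"]) instead of opening the file itself. Another option is to have the workers return their rows and write everything from the main thread after as_completed.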

python point inside polygon (point cloud data)

Number of points: 100,000,000 (a 4 GB file).
I am reading a CSV file and saving the points into separate CSV files depending on whether they fall inside a polygon.
I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of this task?
Please suggest alternative options.
Performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")
file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"
    coords = [[19.803499,15.2265],[-35.293499,33.7495],
              [-49.6675,33.726501],[-48.022499,20.4715],
              [-36.336498,-4.925],[-32.6105,-45.494499],
              [-10.5275,-38.3815],[-11.93835,-20.8235],
              [26.939501,-18.095501],[19.803499,15.2265]]
    poly = Polygon(coords)
    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if str(result) == 'True':
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
                print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
                #print('False', str(x), str(y), str(z))
I used pd.read_csv instead of csv.reader, so the performance has improved (processing time: 1234 sec -> 31 sec).
However, I also tried to use Python multiprocessing, but I don't understand it well.
import pandas as pd
from shapely.geometry import *

data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499), (16.5735,-26.269501), (0.4755,-28.6635)])
cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
Here is example code for Python multiprocessing pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # wait 2 seconds
    print(x*x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
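A rough sketch of how the Pool example could be applied to this problem, not a tested solution: split the list of points into chunks and let each worker test its chunk against the polygon, then write the results from the main process (chunk size, worker count, and function names are illustrative):

from multiprocessing import Pool
from shapely.geometry import Point, Polygon
import pandas as pd

poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499),
                (16.5735,-26.269501), (0.4755,-28.6635)])

def classify_chunk(rows):
    # rows is a list of [x, y, ...] records; return (row, inside) pairs
    return [(row, poly.intersects(Point(row[0], row[1]))) for row in rows]

if __name__ == '__main__':
    data = pd.read_csv("/sample.csv")
    cord = data.values.tolist()
    chunk_size = 100000
    chunks = [cord[i:i + chunk_size] for i in range(0, len(cord), chunk_size)]
    with Pool(8) as p:
        for chunk_result in p.map(classify_chunk, chunks):
            for row, inside in chunk_result:
                # append row to the TRUE or FALSE output file here,
                # as in the original loop
                pass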

divide a disk image into smaller parts using Python

I would like to write a program that takes a .dmg file that is 1.6 GB and splits it into 100 MB chunks.
I would like to also write another program that later can put everything back together so that it can be mounted and used.
I am very new to Python (and any type of programming language in general) and cannot find anything on here about this specific thing. Let me know if I am using incorrect terminology too so that I can learn how to search more effectively.
Thanks!
Try this example:
split.py
import sys, os

kilobytes = 1024
megabytes = kilobytes * 1000
chunksize = int(1.4 * megabytes)

def split(fromfile, todir, chunksize=chunksize):
    if not os.path.exists(todir):
        os.mkdir(todir)
    else:
        for fname in os.listdir(todir):
            os.remove(os.path.join(todir, fname))
    partnum = 0
    input = open(fromfile, 'rb')
    while 1:
        chunk = input.read(chunksize)
        if not chunk: break
        partnum = partnum+1
        filename = os.path.join(todir, ('part%04d' % partnum))
        fileobj = open(filename, 'wb')
        fileobj.write(chunk)
        fileobj.close()
    input.close()
    assert partnum <= 9999
    return partnum

if __name__ == '__main__':
    try:
        parts = split('/Users/example/Desktop/SO/st/example.mp4', '/Users/example/Desktop/SO/st/new', 2000000)  # 100000000 == 100 mb
    except:
        print('Error during split')
And to join the parts back together:
join.py
import os, sys

readsize = 1024

def join(fromdir, tofile):
    output = open(tofile, 'wb')
    parts = os.listdir(fromdir)
    parts.sort()
    for filename in parts:
        filepath = os.path.join(fromdir, filename)
        fileobj = open(filepath, 'rb')
        while 1:
            filebytes = fileobj.read(readsize)
            if not filebytes: break
            output.write(filebytes)
        fileobj.close()
    output.close()

if __name__ == '__main__':
    try:
        join('/Users/example/Desktop/SO/st/new', 'example_join.mp4')
    except:
        print('Error joining files:')
    else:
        print('Join complete!')
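For the 1.6 GB .dmg from the question, the calls would look roughly like this, assuming the two scripts above are saved as split.py and join.py (paths are placeholders; 100 * megabytes is 102,400,000 bytes with the definitions above, roughly 100 MB per part):

from split import split, megabytes  # the split.py shown above
from join import join               # the join.py shown above

# write part0001, part0002, ... of about 100 MB each
split('/path/to/image.dmg', '/path/to/parts', 100 * megabytes)

# later, reassemble the parts into a mountable image
join('/path/to/parts', '/path/to/image_rejoined.dmg')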

Downloading files concurrently in Python

This code downloads metadata from a repository, writes that data to file, downloads a pdf, converts that pdf to text, then deletes the original pdf:
for record in records:
    record_data = []  # data is stored in record_data
    for name, metadata in record.metadata.items():
        for i, value in enumerate(metadata):
            if value:
                record_data.append(value)

    fulltext = ''
    file_path = ''
    file_path_metadata = ''
    unique_id = str(uuid.uuid4())
    for data in record_data:
        if 'Fulltext' in data:
            # the link to the pdf
            fulltext = data.replace('Fulltext ', '')
            # path where the txt file will be stored
            file_path = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '.pdf'
            # path where the metadata will be stored
            file_path_metadata = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '_metadata.txt'
            print fulltext, file_path

    # Write metadata to file
    if fulltext:
        try:
            write_metadata = open(path_to_institute + file_path_metadata, 'w')
            for i, data in enumerate(record_data):
                write_metadata.write('MD_' + str(i) + ': ' + data.encode('utf8') + '\n')
            write_metadata.close()
        except Exception as e:
            # Exceptions due to missing path to file
            print 'Exception when writing metadata: {}'.format(e)
            print fulltext, path_to_institute, file_path_metadata

        # Download pdf
        download_pdf(fulltext, path_to_institute + file_path)

        # Create text file and delete pdf
        pdf2text(path_to_institute + file_path)
Doing some measurements, the download_pdf and pdf2text methods take quite a long time.
Here are those methods:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import urllib2
import os

def remove_file(path):
    try:
        os.remove(path)
    except OSError, e:
        print ("Error: %s - %s." % (e.filename, e.strerror))

def pdf2text(path):
    string_handling = StringIO()
    parser = PDFParser(open(path, 'r'))
    save_file = open(path.replace('.pdf', '.txt'), 'w')
    try:
        document = PDFDocument(parser)
    except Exception as e:
        print '{} is not a readable document. Exception {}'.format(path, e)
        return
    if document.is_extractable:
        recourse_manager = PDFResourceManager()
        device = TextConverter(recourse_manager,
                               string_handling,
                               codec='ascii',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(recourse_manager, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        # write to file
        save_file.write(string_handling.getvalue())
        save_file.close()

        # deletes pdf
        remove_file(path)
    else:
        print(path, "Warning: could not extract text from pdf file.")
        return

def download_pdf(url, path):
    try:
        f = urllib2.urlopen(url)
    except Exception as e:
        print e
        f = None
    if f:
        data = f.read()
        with open(path, "wb") as code:
            code.write(data)
            code.close()
So I'm thinking I should run those in parallel.
I tried this, but it did not work:
pool = mp.Pool(processes=len(process_data))
for i in process_data:
    print i
    # pool.apply blocks until the call completes, so these still run one at a time
    pool.apply(download_pdf, args=(i[0], i[1]))

pool = mp.Pool(processes=len(process_data))
for i in process_data:
    print i[1]
    pool.apply(pdf2text, args=(i[1],))
It takes just as long, and the printing happens as if the processes are run one at a time...
I finally found out a way to run the code in parallel. Unbelievable how much faster it got.
import multiprocessing as mp

jobs = []
for i in process_data:
    p = mp.Process(target=download_pdf, args=(i[0], i[1]))
    jobs.append(p)
    p.start()

for i, data in enumerate(process_data):
    print data
    p = mp.Process(target=pdf2text, args=(data[1],))
    jobs[i].join()  # wait for the download of item i to finish first
    p.start()
Here is a great article on how to build things in parallel;
it uses multiprocessing.dummy to run the work in different threads.
Here is a little example:
from urllib2 import urlopen
from multiprocessing.dummy import Pool

urls = [url_a,
        url_b,
        url_c]

pool = Pool()
res = pool.map(urlopen, urls)
pool.close()
pool.join()
For Python >= 3.3 I suggest concurrent.futures.
Example:
import urllib.request
from concurrent.futures import ThreadPoolExecutor

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def load_url(url, timeout):
    return urllib.request.urlopen(url, timeout=timeout).read()

with ThreadPoolExecutor(50) as executor:
    future_list = [executor.submit(load_url, url, 30) for url in URLS]
Example adapted from the linked source.
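To actually collect the downloaded pages from those futures, one small addition (not part of the original snippet) is to iterate them as they complete:

from concurrent.futures import as_completed

for future in as_completed(future_list):
    try:
        page = future.result()
        print(len(page), 'bytes downloaded')
    except Exception as exc:
        print('download failed:', exc)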
