I want to save my output data into the text file where each new line is shown in a different row. Currently each row is delimited by \n, I want new lines to be saved in different rows.
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
PDF_file = "F:/ABC/Doc_1.pdf"
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
filename = "page_"+str(image_counter)+".jpg"
page.save(filename, 'JPEG')
image_counter = image_counter + 1
filelimit = image_counter-1
outfile = "F:/ABC/intermediate_steps/out_text.txt"
f = open(outfile, "a")
for i in range(1, 2):
filename = "page_"+str(i)+".jpg"
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"\ABC\opencv-text-detection\Tesseract-OCR\tesseract.exe"
from pytesseract import pytesseract
text = str(((pytesseract.image_to_string(Image.open(filename)))))
text = text.replace('-\n', '')
#text = text.splitlines()
f.writelines("Data Extracted from next page starts now.")
f.writelines(str(text.encode('utf-8')))
f.close()
For eg :-
ABC
DEF
GHI
Current output :-
ABC\nDEF\nGHI\n
When you do
f.writelines(str(text.encode('utf-8')))
You convert the newline byte \n to its escaped version \\n. You should use just
f.writelines(text)
Related
I am trying to split/extract PDF pages between two strings - excluding pages that contains both strings.
For example,
String1 = "String1"
String2 = "String2"
If page 2 has "String1" and page 10 has "String2", then the output PDF should contain pages from 3 to 9.
The below script extracts all the pages which contains or have the string and creates a single PDF will all the pages containing the string.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import fitz
import re
nameList = list()
directory = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
output = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"
for file in os.listdir(directory):
if not file.endswith(".pdf"):
continue
with open(os.path.join(directory,file), 'rb') as pdfFileObj: # Changes here
doc = fitz.open(directory+ "\\" + file)
nameList.append(str(file))
docPageCount = doc.page_count
reader = PdfReader(pdfFileObj)
writer = PdfWriter()
pageNo = list()
# Open the pdf file
object = PdfReader(doc.name, 'rb') # Get number of pages
NumPages = docPageCount # Enter code here
String = "REMIT-TO-CODE" # Extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
if re.search(String,Text):
##print("Pattern Found on Page: " + str(i))
pageNo.append(str(i))
minPg = min(pageNo, key=float)
minPg = int(minPg)
for page_num in range(minPg, docPageCount):
page = reader.pages[page_num]
# This is CPU intensive! It ZIPs the contents of the page
page.compress_content_streams()
end = output+"\\"+"PAYMENT_WIRE_STEP_1_2_" + file
writer.add_page(page)
with open(end, "wb") as fh:
writer.remove_links()
writer.write(fh)
I have a function that asks the user for a PDF file and receive the page number the user wish to convert into an image. The function usually works fine however with a few PDFs it does not work, the image that is returned is blank and it has 4 mega bytes. Apparently it has something to do with the size of the file. Is there a way to solve this problem?
from PyPDF2 import PdfFileReader, PdfFileWriter
from tkinter.filedialog import askopenfilename
from pdf2image import convert_from_path
import os
import PIL
PIL.Image.MAX_IMAGE_PIXELS = None
def convert_pdf(page_number):
filename = askopenfilename()
pdf_file_path = filename
file_base_name = pdf_file_path.replace('.pdf', '')
pdf = PdfFileReader(pdf_file_path)
pages = [page_number]
pdfWriter = PdfFileWriter()
for page_num in pages:
pdfWriter.addPage(pdf.getPage(page_num))
with open('{0}_subset.pdf'.format(file_base_name[:-5]), 'wb') as f:
pdfWriter.write(f)
f.close()
n = file_base_name[:-5]
nome = f'{n}_subset.pdf'
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
name = os.path.basename(nome).split('/')[-1][:-4]
for page in pages:
image_name = "Page_" + str(i) + f"{name}.jpg"
page.save(image_name, "JPEG")
i = i + 1
The solution to this problem was to change the DPI parameter of convert_from_path function. It is important to leave the DPI as it is, since I found that certain images become really small, and therefore unreadable.
try:
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
except:
PIL.Image.MAX_IMAGE_PIXELS = None
pages = convert_from_path(nome, 25,poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader
path = "file.pdf"
reader = PdfReader(path)
pages = len(reader.pages)
output = open(path.rsplit(".", 1)[0] + ".txt", "a")
for i in range(1, pages + 1):
image = convert_from_path(path, first_page=i, last_page=i)
print(f"Converting page: {i}/{pages}")
say = pytesseract.image_to_string(image[0])
output.write(say + "\n")
output.flush()
print(f"Conversion of {path} Complete")
In a part of my project for creating a dataset, I have a text file containing a list of a bunch of latex equations . Now I want to convert them into images through python in diffrent font sizes. But i dont know how to do it. Please help.
This is the list of latex symbols I am using:- https://docs.mathpix.com/#vocabulary
#
import shutil
import os
from pdflatex import PDFLaTeX
from pdf2image import convert_from_path
from PIL import Image
def crop(file):
img = Image.open(file)
area = (300,300, 800, 800)
cropped_img = img.crop(area)
cropped_img.save(file)
def save_images(images_names,pdf_path,images_path=""):
# Store Pdf with convert_from_path function
images = convert_from_path(pdf_path)
if len(images_names)==0:
print("names is empty")
return
i=0
for img in images:
img.save(images_path+"/"+images_names[i]+".jpg", 'JPEG')
crop(images_path+"/"+images_names[i]+".jpg")
i+=1
print("Successfully converted")
def create_image_from_latex(image_name,latex):
if "rough" not in os.listdir():
os.mkdir("rough")
if "images_from_latex" not in os.listdir():
os.mkdir("images_from_latex")
f=open("rough/a.tex","w+")
f.write("\\documentclass{article}\n\\usepackage{chemfig}\n\\begin{document}\n")
f.write(latex+"\n")
f.write(r"\end{document}")
f.close()
#print(os.getcwd()+"/a.tex")
#tex="/a.tex"
pdfl = PDFLaTeX.from_texfile('rough/a.tex')
pdf, log, completed_process = pdfl.create_pdf(keep_pdf_file=True, keep_log_file=False)
f=open("rough/a.pdf","wb")
f.write(pdf)
f.close()
save_images([image_name],"a.pdf","images_from_latex")
os.remove("rough/a.pdf")
shutil.rmtree("rough")
#create_image_from_latex("new_image",lat)
def create_images_from_text_file_with_latexes(text_file):
with open(text_file) as f:
latexes=f.readlines()
ind=1
for lat in latexes:
create_image_from_latex("%0.3d_"%ind,lat)
ind+=1
#
I'm trying to loop through a list of ~3,000 URLs and create QR codes for them. In one column I have the URLs and in another column I have what I want the QR code file names to be named when output as images.
The problem is the URLs that get converted to QR codes and my file names both come out encased in brackets.
For example:
URL Filename
www.abel.com Abel
Comes out as:
URL in QR Code Filename of QR Code
[www.abel.com] [Abel]
Here's my code so far:
import csv
import qrcode
import pandas as pd
df = pd.read_csv('QR_Python_Test.csv')
i = 1
x = df.iloc[[i]]
print(
x.QR_Code_Name.values)
for i in df.index:
z = df.iloc[[i]]
x = str(z.Link_Short.values)
qr = qrcode.QRCode(version=5, error_correction=qrcode.constants.ERROR_CORRECT_L,box_size=5,border=2,)
qr.add_data(x)
qr.make(fit=True)
img = qr.make_image()
file_name = str(z.QR_Code_Name.values) + ".png"
print('Saving %s' % file_name)
image_file = open(file_name, "w")
img.save(file_name)
image_file.close()
file.close()
And some sample data:
URL Filename
www.apple.com Apple
www.google.com Google
www.microsoft.com Microsoft
www.linux.org Linux
Thank you for your help,
Me
If your DataFrame contains the correct information, you can use DataFrame.itertuples
also separate the functions
reading the data from the file
generating the qr-code
saving the files
That way, you can test each of these individually
def generate_images(df):
for row in df.itertuples():
yield row.Filename, generate_qr(row.URL)
def generate_qr(url):
qr = qrcode.QRCode(version=5, error_correction=qrcode.constants.ERROR_CORRECT_L,box_size=5,border=2,)
qr.add_data(url)
qr.make(fit=True)
return qr.make_image()
def save_qr_code(qr_codes):
for filename, qr_code in qr_codes:
filename = filename + '.png'
print('saving to file %s' % (filename,)
with open(filename, 'wb') as file:
qr_code.save(file)
df = pd.read_csv('my_data.csv')
qr_codes = generate_images(df)
save_qr_code(qr_codes)