Extract numbers and text in Python Pandas

Extract numbers and text in Python Pandas - python

I am new to Python Pandas, need your guidance. I have following below code which extract specific data from a pdf files and export into a excel file. The code is working fine, however all data are exported into text format. Is there any way I can use text and number extract in same code.
import os
import pandas as pd
import numpy as np
import glob
import pdfplumber
def get_keyword(start, end, text):
for i in range(len(start)):
try:
field = ((text.split(start[i]))[1].split(end[i])[0])
return field
except:
continue
def main():
my_dataframe = pd.DataFrame()
for files in glob.glob("C:/PDFs\*.pdf"):
with pdfplumber.open(files) as pdf:
page = pdf.pages[0]
text = page.extract_text()
text = " ".join(text.split())
# obtain keyword #1-Find Supplier-This is text & it is fine
start = ['SUPPLIER ']
end = [' Purchase']
keyword1 = get_keyword(start, end, text)
# obtain keyword #2-Find Invoice, This is number-which need to number not text.
start = ['Invoice Weight(Kg) ']
end = ['.00 Net Weight.(Kg)']
keyword2 = get_keyword(start, end, text)
my_list = [keyword1, keyword2]
my_list = pd.Series(my_list)
my_dataframe = my_dataframe.append(my_list, ignore_index=True)
print("Document's keywords have been extracted successfully!")
my_dataframe = my_dataframe.rename(columns={0:'Supplier',
1:'Invoice Number',
2:'Mill Lot Number'})
save_path: str = 'C:/PDFs'
os.chdir(save_path)
# extract my dataframe to an .xlsx file!
my_dataframe.to_excel('sample.xlsx', sheet_name = 'Sheet1')
print("")
print(my_dataframe)
if __name__ == '__main__':
main()
I tried using #str.extract(r"([A-Za-z\s]+)([\d-]+)"))- but it did not work. I also tried below link, but could not decipher. Kindly help!!
Python pandas extracting numbers and text within text to two new column

Related

to classify the extracted text from images into spam or non spam using python

<**
extract text from all the images in a folder
storing the text in a single file
from PIL import Image
import pytesseract as pt
import pandas as pd
from tabulate import tabulate
from io import StringIO
import os
import json
import csv
def main():
# path for the folder for getting the raw images
path ="E:/mehr mtech p1/images/"
# link to the file in which output needs to be kept
fullTempPath ="E:/mehr mtech p1/out.txt"
# iterating the images inside the folder
for imageName in os.listdir(path):
inputPath = os.path.join(path, imageName)
img = Image.open(inputPath)
#print(imageName)
# applying ocr using pytesseract for python
pt.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
text = pt.image_to_string(img, lang ="eng")
#print(text)
dictionary = {'image': imageName, 'Text': text}
print(dictionary)
#Create a datafrmae from the dictionary
df = pd.DataFrame(dictionary, index=[0])
#print dataframe.
#print(df)
#print(tabulate(df, headers = 'keys', tablefmt = 'psql'))
#Creating a string of the dictionary to print the data with labels in string format in the txt file
#string = json.dumps(dictionary)
#f1 = open("E:/mehr mtech p1/mmyfile.txt","a+")
#f1.write(string)
#df = pd.read_csv(string, sep =";")
#print(df)
df.to_csv("E:/mehr mtech p1/tableimage.csv")
# saving the text for appending it to the output.txt file
# a + parameter used for creating the file if not present
# and if present then append the text content
file1 = open(fullTempPath, "a+")
# providing the name of the image
file1.write(imageName+"\n")
# providing the content in the image
file1.write(text+"\n")
file1.close()
# for printing the output file
file2 = open(fullTempPath, 'r')
print(file2.read())
file2.close()
if name == 'main':
main()
**>
the extracted text was converted into a dataframe by first converting it into a dictionary. but while converting that dataframe to a csv file and transferring data to excel file..only 1record i.e., only text of 1 image is coming in csv file..what to do now
the dataframe is coming in this way
the dictionary is coming this way
csv file
error

i cant replace a word in word document with another thing by docx

I want to build a function that converts names from CSV to a document in word by Docx library and put the serial number in specific place in word document and same with date but I ran into a couple of problems such as:
only the name is replaced but not the date and serial number.
can't save the docx document in the file I created
here is my code as I want the three loops of n, l, and m to work together:
import docx
import pandas as pd
from datetime import datetime
import os
from docx2pdf import convert
def auto_fill(x,y):
database=pd.read_csv(x)
df=pd.DataFrame(database)
df=df.dropna(axis=0)
date = datetime.date(datetime.now())
strdate = date.strftime("%m-%d-%Y")
path = strdate
newfile = os.makedirs(path)
for i in range(len(df.Name)):
targeted_doc = docx.Document(y)
for n in range (len(targeted_doc.paragraphs)):
if targeted_doc.paragraphs[n].text=="Name of trainee":
name=targeted_doc.paragraphs[n].runs[0].text=df.at[i,'Name']
for m in range(len(targeted_doc.paragraphs)):
if targeted_doc.paragraphs[m].text == "tissue date":
date = targeted_doc.paragraphs[m].runs[0].text = strdate
for l in range(len(targeted_doc.paragraphs)):
if targeted_doc.paragraphs[l].text == "tserial number":
sr_num = targeted_doc.paragraphs[l].runs[0].text = df.at[i, 'serial number']
name_of_file = (f"{df.at[i, 'Name']}.docx")
outputdoc=targeted_doc.save(name_of_file)
completesave = os.path.join(path, name_of_file)
convert(name_of_file,path)
auto_fill("database.csv","01.docx")

Extract Text into Column using Regex

I want to extract data (S no, Item Code, Price and Size) from the attached PDF Document in to columns.
The re.compile works for the S no, Item Code and Price, but as soon as I put the Size - it gives a limited output. I am unable to figure out why? Can you please help
(Attached picture of the PDF page)
Import pandas as pd
Import re
Import PyPDF2
file = open("Petchem.pdf", "rb")
pdfReader = PyPDF2.PdfFileReader(file)
my_dict = {"S no":[], "Item Code":[], "Price":[], "Size":[]}
for page in range (1,25):
pageObj = pdfReader.getPage(page)
data = pageObj.extractText()
size = re.compile(r'((\d{2,4}?)(\d{10})EA\s(\d?\d?,?\d?\d?\d.\d\d)[\s\w\d,:/.()-])')
for number in size.findall(data):
S_No = my_dict["S No"].append(number[1])
Item_Code = my_dict["Item Code"].append(number[2])
Price = my_dict["Price"].append(number[3])
Size = my_dict["Size"].append(number[4])
print(number[1])
a_file = open("Column_Breakup.csv", "w")
datadf = pd.DataFrame(my_dict)
datadf.to_csv("Column_Breakup.csv")
a_file.close()

Run a python script on all files in a directory

First time posting a question here, hopefully, someone who experienced/tried this please share your insights... I've been working to get this far in the last few days and nights... now I am getting nowhere to loop this script on every file in a directory.
Bascially, these two scripts work perfectly fine it brings a pdf file and changes it to an excel workbook. Now what I need to do is going through all files from a selected directory and do the same job.
I am keep getting stuck at the opening the file stage - is this saying that the data (the pdf page - data[0]) cant be called in? or should i add more stages in to bring the dataset in...?
Do I have to create a list for the dataset so I can call in the data as you would have more than a data to call in.. is this why python can read the data[0] ???
Revised Script
# import
import os
import glob
import pdftotext
import openpyxl
from pathlib import Path
from string import ascii_uppercase
# open a pdf file
def to_excel(pdf_file):
with open(pdf_file,'rb') as f:
data = pdftotext.PDF(f)
# operate data to get titles, values
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# get an output file name
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# create an excel workbook
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# save the excel workbook
excel_file.save(filename)
excel_file.close
# run a python script every file in a directory
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)

I basically know what you want to do, but your code's indent is not so readable... especially it's python.
Your goal is to create a excel for each pdf file in you prefix dir? or aggregate all the pdf files together to a single excel file?
The follow coding is for the first goal.
Code logic.
get all the pdf file
loop over all the pdf file, for each:
open pdf file
some operation
export to excel file
You full code maybe like this(just guess):
# ----------------import part-------------------
import os
import glob
import pdftotext
import openpyxl
from string import ascii_uppercase
from pathlib import Path
def to_excel(pdf_file):
with open(pdf_file, 'rb') as f: # this open the pdf file
data = pdftotext.PDF(f)
# ---------------operate the data, get title and value-----------
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# ------------------get output file name---------------------
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# ------------------create excel file sheet------------------
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# --------------------save----------------
excel_file.save(filename)
excel_file.close
# -------------------main program---------------
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)

QR Code code inserting '[' and ']', how to stop this?

I'm trying to loop through a list of ~3,000 URLs and create QR codes for them. In one column I have the URLs and in another column I have what I want the QR code file names to be named when output as images.
The problem is the URLs that get converted to QR codes and my file names both come out encased in brackets.
For example:
URL Filename
www.abel.com Abel
Comes out as:
URL in QR Code Filename of QR Code
[www.abel.com] [Abel]
Here's my code so far:
import csv
import qrcode
import pandas as pd
df = pd.read_csv('QR_Python_Test.csv')
i = 1
x = df.iloc[[i]]
print(
x.QR_Code_Name.values)
for i in df.index:
z = df.iloc[[i]]
x = str(z.Link_Short.values)
qr = qrcode.QRCode(version=5, error_correction=qrcode.constants.ERROR_CORRECT_L,box_size=5,border=2,)
qr.add_data(x)
qr.make(fit=True)
img = qr.make_image()
file_name = str(z.QR_Code_Name.values) + ".png"
print('Saving %s' % file_name)
image_file = open(file_name, "w")
img.save(file_name)
image_file.close()
file.close()
And some sample data:
URL Filename
www.apple.com Apple
www.google.com Google
www.microsoft.com Microsoft
www.linux.org Linux
Thank you for your help,
Me

If your DataFrame contains the correct information, you can use DataFrame.itertuples
also separate the functions
reading the data from the file
generating the qr-code
saving the files
That way, you can test each of these individually
def generate_images(df):
for row in df.itertuples():
yield row.Filename, generate_qr(row.URL)
def generate_qr(url):
qr = qrcode.QRCode(version=5, error_correction=qrcode.constants.ERROR_CORRECT_L,box_size=5,border=2,)
qr.add_data(url)
qr.make(fit=True)
return qr.make_image()
def save_qr_code(qr_codes):
for filename, qr_code in qr_codes:
filename = filename + '.png'
print('saving to file %s' % (filename,)
with open(filename, 'wb') as file:
qr_code.save(file)
df = pd.read_csv('my_data.csv')
qr_codes = generate_images(df)
save_qr_code(qr_codes)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract numbers and text in Python Pandas - python

Related

to classify the extracted text from images into spam or non spam using python

i cant replace a word in word document with another thing by docx

Extract Text into Column using Regex

Run a python script on all files in a directory

QR Code code inserting '[' and ']', how to stop this?

Categories

Resources