Mapping of two text documents with Python

I have annotated some textual data and now I am trying to map it with the original text file to get more information out.
I have all information of the annotations in a JSON file, from which I successfully parsed all the relevant information. I stored the information as seen below.
Column 1 = entity class
Column 2 = starting point of the text
Column 3 = length of the text (in chars)
Column 4 = value of entity label
Column 5 = actual text that was annotated
My goal now is to include the non-annotated text as well. Not every sentence or character of a text document has been annotated, but I want to include them so I can feed all the information into a DL algorithm. So every sentence that has not been annotated should be included, showing "None" as the entity class and entity label.
Appreciate any hint or help on that!
Thanks!

The information in your annotation file is not quite accurate. Since you stripped out whitespace, the length of the text has to be adjusted accordingly.
def map_with_text(data_file, ann_file, out_file):
    annots = []
    # Read annotation information
    with open(ann_file, 'r') as file_in:
        for line in file_in:
            components = line.split("\t")
            label = components[0]
            begin = int(components[1])
            length = int(components[2])
            f_4 = int(components[3])
            f_5 = int(components[4])
            text = components[5].strip()
            annots.append((label, begin, length, f_4, f_5, text))
    annots = sorted(annots, key=lambda c: c[1])
    # Read text data
    with open(data_file, 'r') as original:
        original_text = original.read()
        length_original = len(original_text)
    # Get positions of text already annotated. Since it was
    # stripped, we cannot use the length. You can modify it if
    # you think your information is accurate.
    # pos_tup = [(begin, begin+length)
    #            for _, begin, length, _, _, text in annots]
    pos_tup = [(begin, begin + len(text))
               for _, begin, length, _, _, text in annots]
    # Get position markers
    pos_marker = [0] + [e for l in pos_tup for e in l] + [length_original]
    # Ranges of positions of text which have not been annotated
    not_ann_pos = [(x, y)
                   for x, y in zip(pos_marker[::2], pos_marker[1::2])]
    # Texts which have not been annotated
    not_ann_txt = [original_text[start:stop]
                   for start, stop in not_ann_pos]
    # Include them in the list
    all_components = [(None, start, len(txt.strip()), None, None, txt.strip())
                      for start, txt in zip(pos_marker[::2], not_ann_txt) if len(txt.strip()) != 0]
    # Add annotated information
    all_components += annots
    # Sort by the start index
    all_components = sorted(all_components, key=lambda c: c[1])
    # Write to the output file
    with open(out_file, 'w') as f:
        for a in all_components:
            f.write(str(a[0]) + "\t" + str(a[1]) + "\t" + str(a[2]) +
                    "\t" + str(a[3]) + "\t" + str(a[4]) + "\t" + str(a[5]) + "\n")

map_with_text('0.txt', '0.ann', 'out0.tsv')
# You can loop, calling the function for each file pair
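For instance, if the text/annotation files are numbered consecutively (hypothetical file names), the call above can be wrapped in a simple loop:

# Assumes pairs named 0.txt/0.ann, 1.txt/1.ann, ... exist in the working directory
for i in range(10):
    map_with_text('{}.txt'.format(i), '{}.ann'.format(i), 'out{}.tsv'.format(i))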

Related

What is spaCy's SpanGroup? When and how do we use it? Will it help with overlapping spans?

I've used the code below to create spans using spacy.
import ast
import csv
from collections import Counter
from spacy.tokens import DocBin

def make_spans(lst):
    """
    Split list of numbers into a list of (start, end) tuples,
    e.g. [1,2,3,4,10,11,12] of indexes becomes [(1,5), (10,13)] (exclusive end index)
    """
    start = None
    spans = []
    last_d = None
    for d in lst:
        if last_d is None:
            start = d
        elif d > last_d + 1:
            spans.append((start, last_d + 1))
            start = d
        last_d = d
    if start is not None:
        spans.append((start, last_d + 1))
    return spans

def read_data(fname: str):
    """
    Read data from CSV file with rows (list(indexes), text)
    :param fname: (relative) path of file with CSV data
    """
    with open(fname, newline='') as csvfile:
        reader = csv.reader(csvfile)
        _ = next(reader)  # skip the headers
        for row in reader:
            lst = ast.literal_eval(row[0])
            text = row[1]
            yield lst, text

def create_docbin(fname: str, basename: str, nlp, span_label='toxic_spans'):
    """Create a DocBin from a CSV with data rows (list(indexes), text)
    For each row:
    - define a `Doc` object from the `text`
    - create `Span` objects with the label 'TOXIC' based on the indexes from the CSV
    - add these Span objects to the doc's entities and those to the doc's spans
    - add the created doc to the `DocBin` object
    """
    doc_bin = DocBin()
    for spans, text in read_data(fname):
        ms = make_spans(spans)
        doc = nlp(text)
        span_lst = []
        for start, end in ms:
            span = doc.char_span(start, end, label='TOXIC')
            if span is not None:
                span_lst.append(span)
        # span_lst is now a list of spaCy `Span` objects
        # Set the Spans as document entities
        doc.set_ents(span_lst)
        # Set the document entities as spans
        doc.spans[span_label] = list(doc.ents)
        doc_bin.add(doc)
    # Save the totality of created documents with their spans in the custom binary `spacy` format
    doc_bin.to_disk(f'corpus/{basename}.spacy')
I'm unable to use overlapping spans; it gives me an error when I have two labels, food and ingredient. The food span covers a longer stretch of text, while the ingredient is just a single word. How can I resolve this? And will SpanGroup help in this implementation?
A SpanGroup is basically just a list of Spans - there's a little more to it, but not much. You really shouldn't have to think about the SpanGroups in particular.
Entities on a Doc are specifically defined as a list of non-overlapping spans. If you need overlapping spans, then yes, you can save them in a SpanGroup, which is exactly what your code is already doing - when you assign a list to doc.spans[something], it's converted into a SpanGroup automatically.
If you want to train a model to predict spans like that, you'll need to use a spancat component. You can read more about that in the spancat blogpost.
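For illustration, here is a minimal sketch (with a blank pipeline and made-up character offsets, so treat the sentence and labels as assumptions) showing two overlapping spans that set_ents would reject but doc.spans happily stores; "sc" is the default key the spancat component reads from:

import spacy

nlp = spacy.blank("en")
doc = nlp("Add two cups of chopped red onion to the pan.")

# Hypothetical overlapping spans: a long FOOD phrase and a one-word INGREDIENT inside it
food = doc.char_span(4, 33, label="FOOD")               # "two cups of chopped red onion"
ingredient = doc.char_span(28, 33, label="INGREDIENT")  # "onion"

# doc.set_ents([food, ingredient]) would raise an error because the spans overlap,
# but assigning the list to doc.spans stores it as a SpanGroup without complaint
doc.spans["sc"] = [food, ingredient]
print(type(doc.spans["sc"]))  # <class 'spacy.tokens.span_group.SpanGroup'>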

Extract name of function and arguments in separate text

I am trying to extract words only from the body of the function. Below you can see my text.
# Example of estimation
## Example of estimation
### Example of estimation
"Some calculation"
""" Note :
The data here is artificial.
Idea is to show how code will look like after estimation.
More information www.google.com
"""
#iterate_jit(nopython=True)
def fun_min_ssc(min_wage, tax_rate,calc_min_profit):
    calc_min_profit = min_wage * tax_rate + min_wage - (min_wage*2)
    return calc_min_profit
Lines starting with #, ##, ###, ", or """ are not needed.
Now I want to extract only the name and the arguments of the function, such as:
Name of the function: fun_min_ssc
Arguments of the function: min_wage, tax_rate, calc_min_profit
I tried to solve this problem with the code below:
import os

f = open("text.txt", "w+")
f.write('''# Example of estimation
## Example of estimation
### Example of estimation
"Some calculation"
""" Note :
The data here is artificial.
Idea is to show how code will look like after estimation.
More information www.google.com
"""
#iterate_jit(nopython=True)
def cal_min_ssc(min_wage, tax_rate,min_profit):
    min_profit = min_wage * tax_rate + min_wage - (min_wage*2)
    return min_profit
''')
f.seek(0)  # rewind so the file contents can be read back
for line in f.readlines():
    print(line, end='')
f.close()

os.getcwd()
os.listdir()
os.chdir('C:/')                     # <-- insert your path
file_reader = open('C:/text.txt')   # <-- insert your path
os.getcwd()

# Open the file in read mode
text = open("text.txt", "r")
# Creating dictionary and counting frequency
d = dict()
# Loop through each line of the file
for line in text:
    # Remove the leading spaces and newline character
    line = line.strip()
    # Convert the characters in line to
    # lowercase to avoid case mismatch
    line = line.lower()
    # Split the line into words
    words = line.split(" ")
    words = line.split(",")
    words = line.split("*")
    # Iterate over each word in line
    for word in words:
        # Check if the word is already in dictionary
        if word in d:
            # Increment count of word by 1
            d[word] = d[word] + 1
        else:
            # Add the word to dictionary with count 1
            d[word] = 1
# Print the contents of dictionary
for key in list(d.keys()):
    print(key, ":", d[key])
So can anybody help me solve this problem, or suggest some other approach that could work?
This might get you on the right track. I have used a regex statement as a specific search criterion to find the lines that start with def and end with :.
x = re.search(r"^def.*:$", line)
Once I have the line in question, I split the line using def and the opening bracket of the function (. This allows me to easily grab the function name.
values = x[0].split('def ')[1].split('(')
function_name = values[0]
I then have to grab the other section, removing the last two characters, i.e. ):
arguments = values[1][:-2].split(', ')
As the arguments are separated by a comma, I can then use that as a split separator. However, I must warn you: make sure they are consistently separated in the same way, i.e. with or without a space after the comma.
I have printed the desired output; however, you can add these items to a list or whatever structure you desire.
Here is my example code (without all the file input stuff):
import re

text = '''# Example of estimation
## Example of estimation
### Example of estimation
"Some calculation"
""" Note :
The data here is artificial.
Idea is to show how code will look like after estimation.
More information www.google.com
"""
#iterate_jit(nopython=True)
def cal_min_ssc(min_wage, tax_rate, min_profit):
    min_profit = min_wage * tax_rate + min_wage - (min_wage*2)
    return min_profit
'''

lines = text.split('\n')
for line in lines:
    x = re.search(r"^def.*:$", line)
    if x is not None:
        values = x[0].split('def ')[1].split('(')
        function_name = values[0]
        arguments = values[1][:-2].split(', ')
        print(f"Function Name: {function_name}")
        print(f"Arguments: {arguments}")
OUTPUT:
Function Name: cal_min_ssc
Arguments: ['min_wage', 'tax_rate', 'min_profit']
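An alternative worth noting: since the sample text happens to be valid Python overall (comments and bare string literals are legal statements), the standard-library ast module can pull out the same information without worrying about comma spacing. A small sketch reusing the text variable from above:

import ast

tree = ast.parse(text)  # parse the snippet as Python source
for node in ast.walk(tree):
    if isinstance(node, ast.FunctionDef):
        print(f"Function Name: {node.name}")
        print(f"Arguments: {[arg.arg for arg in node.args.args]}")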

Python find and replace multiple comment lines in array with parsed single line comment

Let's say that we've read a python file with multiple lines of comments and then some code. This is stored in data as a list or np.ndarray
data = ["# this", "# is", "# the first comment", "print('hello world')", "# second comment"]
expected_output = ["```this is the first comment```", "print('hello world')", "``` second comment```"]
expected_output
The desired output will replace the multiple elements starting with a # character with the single parsed comment wrapped in the backtick characters
['```this is the first comment```',
"print('hello world')",
'``` second comment```']
I can do the parsing but I don't know how to replace the individual lines with the newly formatted single lines (e.g. index [0, 1, 2] in the example above).
The script so far:
from pathlib import Path
import numpy as np
from itertools import groupby
from operator import itemgetter

def get_consecutive_group_edges(data: np.ndarray):
    # https://stackoverflow.com/a/2154437/9940782
    edges = []
    for k, g in groupby(enumerate(data), lambda x: x[0] - x[1]):
        group = (map(itemgetter(1), g))
        group = list(map(int, group))
        edges.append((group[0], group[-1]))
    # convert ranges into group index
    # https://stackoverflow.com/a/952952/9940782
    group_lookup = dict(enumerate(edges))
    return group_lookup

if __name__ == "__main__":
    # https://stackoverflow.com/a/17141572/9940782
    filedata = ["# this", "# is", "# the first comment", "print('hello world')", "# second comment"]
    # find all consecutive lines starting as comments
    comment_lines = np.argwhere([l[0] == "#" for l in filedata])
    group_lookup = get_consecutive_group_edges(comment_lines)
    output_lines = []
    for comment_idx in group_lookup.keys():
        # extract the comment groups
        min_comment_line = group_lookup[comment_idx][0]
        max_comment_line = group_lookup[comment_idx][1] + 1
        data = filedata[min_comment_line:max_comment_line]
        # remove the comment characters
        output = "".join(data).replace("\n", " ").replace("#", "")
        # wrap in ```
        output = "```" + output + "```" + "\n"
I am failing at the final step: How do I replace all of the values between min_comment_line and max_comment_line for each group with the single, newly parsed output?
Can I do something with the non-commented lines?
non_comment_lines = np.argwhere([l[0] != "#" for l in filedata])
You can assign to a list slice in Python, which can replace multiple elements with one:
...
# make a copy of the original list, so we can replace the comments
output_lines = filedata.copy()
# iterate backwards so the indices line up
for comment_idx in reversed(group_lookup):
    # extract the comment groups
    min_comment_line = group_lookup[comment_idx][0]
    max_comment_line = group_lookup[comment_idx][1] + 1
    data = filedata[min_comment_line:max_comment_line]
    # remove the comment characters
    output = "".join(data).replace("\n", " ").replace("#", "")
    # wrap in ```
    output = "```" + output + "```"
    output_lines[min_comment_line:max_comment_line] = [output]
However, the entire operation can be much simpler, since groupby only groups consecutive matching elements:
output_lines = []
# iterate over consecutive sections of comments and code
for is_comment, lines in groupby(filedata, key=lambda x: x[0] == "#"):
    if is_comment:
        # remove the comment characters
        output = "".join(lines).replace("\n", " ").replace("#", "")
        # wrap in ```
        output_lines.append("```" + output + "```")
    else:
        # leave code lines unchanged
        output_lines.extend(lines)

How to convert inkml file to an image format

I have a dataset consisting of InkML files of handwritten text. I want to convert it to a usable image format to train a CNN. A Python script would be helpful.
I found a method; the source code is given below:
import xml.etree.ElementTree as ET

def get_traces_data(inkml_file_abs_path):
    traces_data = []
    tree = ET.parse(inkml_file_abs_path)
    root = tree.getroot()
    doc_namespace = "{http://www.w3.org/2003/InkML}"

    'Stores traces_all with their corresponding id'
    traces_all = [{'id': trace_tag.get('id'),
                   'coords': [[round(float(axis_coord)) if float(axis_coord).is_integer() else round(float(axis_coord))
                               for axis_coord in coord[1:].split(' ')] if coord.startswith(' ')
                              else [round(float(axis_coord)) if float(axis_coord).is_integer() else round(float(axis_coord))
                                    for axis_coord in coord.split(' ')]
                              for coord in (trace_tag.text).replace('\n', '').split(',')]}
                  for trace_tag in root.findall(doc_namespace + 'trace')]

    # print("before sort ", traces_all)
    'Sort traces_all list by id to make searching for references faster'
    traces_all.sort(key=lambda trace_dict: int(trace_dict['id']))
    # print("after sort ", traces_all)

    'Always 1st traceGroup is a redundant wrapper'
    traceGroupWrapper = root.find(doc_namespace + 'traceGroup')

    if traceGroupWrapper is not None:
        for traceGroup in traceGroupWrapper.findall(doc_namespace + 'traceGroup'):
            label = traceGroup.find(doc_namespace + 'annotation').text

            'traces of the current traceGroup'
            traces_curr = []
            for traceView in traceGroup.findall(doc_namespace + 'traceView'):
                'Id reference to specific trace tag corresponding to currently considered label'
                traceDataRef = int(traceView.get('traceDataRef'))

                'Each trace is represented by a list of coordinates to connect'
                single_trace = traces_all[traceDataRef]['coords']
                traces_curr.append(single_trace)

            traces_data.append({'label': label, 'trace_group': traces_curr})
    else:
        'Consider Validation data that has no labels'
        [traces_data.append({'trace_group': [trace['coords']]}) for trace in traces_all]

    return traces_data
You may consider using xml.etree.ElementTree in Python to parse your inkml files and use OpenCV's cv2.line method to connect the points to draw the stroke.
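For example, a rough rendering sketch along those lines (untested, and assuming each trace is a list of [x, y, ...] coordinate lists as returned by get_traces_data above) might look like:

import cv2
import numpy as np

def trace_group_to_image(trace_group, size=256, margin=10, thickness=2):
    """Render one 'trace_group' entry (a list of traces) to a grayscale image."""
    # Collect every (x, y) point to compute a bounding box for scaling
    pts = np.array([pt[:2] for trace in trace_group for pt in trace], dtype=np.float32)
    min_xy = pts.min(axis=0)
    span = float((pts.max(axis=0) - min_xy).max())
    scale = (size - 2 * margin) / span if span > 0 else 1.0

    img = np.full((size, size), 255, dtype=np.uint8)  # white canvas
    for trace in trace_group:
        scaled = ((np.array([pt[:2] for pt in trace], dtype=np.float32) - min_xy) * scale + margin).astype(int)
        # Connect consecutive points of each stroke with line segments
        for p, q in zip(scaled[:-1], scaled[1:]):
            cv2.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), color=0, thickness=thickness)
    return img

# Hypothetical usage:
# traces = get_traces_data("sample.inkml")
# cv2.imwrite("sample_0.png", trace_group_to_image(traces[0]['trace_group']))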

A Solution for Extracting Tabular Data from a PDF file (sort-of)

I had a need to extract tabular data on a large number of pages from many PDF documents. Using the built-in text export capability from within Adobe's Acrobat Reader was useless – text extracted that way loses the spatial relationships established by the tables. There have been a number of questions raised by others, and many solutions offered for this problem that I had tried, but the results varied between poor and terrible. So I set about to develop my own solution. It's developed enough (I think) that it's ready to share here.
I first tried to look at the distribution of text (in terms of their x & y locations on the page) to try and identify where the row and column breaks are located. By using the Python Module ‘pdfminer’, I extracted the text and BoundingBox parameters, sifted through each piece of text and mapped how many pieces of text were on a page for a given x or y value. The idea was to look through the distribution of text (horizontally for row breaks, and vertically for column breaks), and when the density was zero (meaning there was a clear gap across, or up/down, the table), that would identify a row or column break.
The idea does work, but only sometimes. It assumes the table has the same number and alignment of cells vertically and horizontally (a simple grid), and that there is a distinct gap between the text of adjacent cells. Also, if there’s text that spans across multiple columns (like a title above the table, a footer below the table, merged cells, etc.), identification of column breaks is more difficult – you might be able to identify which text elements above or below the table should be ignored, but I couldn’t find a good approach for dealing with merged cells.
When it came time to look horizontally to identify row breaks, there were several other challenges. First, pdfminer automatically tries to group pieces of text that are located near each other, even when they span more than one cell in the table. In those instances, the BoundingBox for that text object includes multiple lines, obscuring any row breaks that might have been crossed. Even if every line of text were extracted separately, the challenge would be to distinguish what was a normal space separating consecutive lines of text, and what was a row break.
After exploring various work-arounds and conducting a number of tests, I decided to step back and try another approach.
The tables that had the data I needed to extract all have borders around them, so I reasoned I should be able to find the elements in the PDF file that draws those lines. However, when I looked at the elements that I could extract from the source file, I got some surprising results.
You would think that lines would be represented as a "line object", but you'd be wrong (at least for the files I was looking at). If they aren't "lines", then maybe they simply draw rectangles for each cell, adjusting the linewidth attribute to get the line thickness they wanted, right? No. It turned out the lines were actually drawn as "rectangle objects" with a very small dimension (narrow width to create vertical lines, or short height to create horizontal lines). And where it looks like the lines meet at the corners, the rectangles don't – they have a very small rectangle to fill in the gaps.
Once I was able to recognize what to look for, I then had to contend with multiple rectangles placed adjacent to one another to create thick lines. Ultimately, I wrote a routine to group similar values and calculate an average value to use for the row and column breaks that I would use later.
Now, it was a matter of processing the text from the table. I chose to use an SQLite database to store, analyze, and regroup the text from the PDF file. I know there are other “pythonic” options out there, and some may find those approaches more familiar and easy to use, but I felt that the amount of data I would be dealing with would be best handled using an actual database file.
As I mentioned earlier, pdfminer groups text that is located near one another, and it may cross cell boundaries. An initial attempt to split pieces of text shown on separate lines within one of these text groups was only partially successful; it’s one of the areas that I intend to develop further (namely, how to bypass the pdfminer LTTextbox routine so I can get the pieces individually).
There is another shortcoming of the pdfminer module when it comes to vertical text. I have been unable to identify any attribute that will either identify when the text is vertical, or what angle (e.g., +90 or -90 degrees) the text is displayed at. And the text grouping routine doesn't seem to know that either: for text rotated +90 degrees (i.e., rotated CCW so that the letters are read from the bottom up), it concatenates the letters in reverse order, separated by newline characters.
The routine below works fairly well, under the circumstances. I know it’s still rough, there are several enhancements to be made, and it’s not packaged in a way that’s ready for widespread distribution, but it seems to have “broken the code” on how to extract tabular data from a PDF file (for the most part). Hopefully, others may be able to use this for their own purposes, and maybe even improve it.
I welcome any ideas, suggestions, or recommendations that you may have.
EDIT: I posted a revised version that includes additional parameters (cell_htol_up, etc.) to help "tune" the algorithm as to which pieces of text belong to a particular cell in the table.
# This was written for use w/Python 2. Use w/Python 3 hasn't been tested & proper execution is not guaranteed.
import os # Library of Operating System routines
import sys # Library of System routines
import sqlite3 # Library of SQLite dB routines
import re # Library for Regular Expressions
import csv # Library to output as Comma Separated Values
import codecs # Library of text Codec types
import cStringIO # Library of String manipulation routines
from pdfminer.pdfparser import PDFParser # Library of PDF text extraction routines
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTLine, LTRect, LTTextBoxVertical
from pdfminer.converter import PDFPageAggregator
########################################################################################################################
def add_new_value(new_value, list_values=[]):
    # Used to exclude duplicate values in a list
    not_in_list = True
    for list_value in list_values:
        # if list_value == new_value:
        if abs(list_value - new_value) < 1:
            not_in_list = False
    if not_in_list:
        list_values.append(new_value)
    return list_values
########################################################################################################################
def condense_list(list_values, grp_tolerance=1):
    # Group values & eliminate duplicate/close values
    tmp_list = []
    for n, list_value in enumerate(list_values):
        if sum(1 for val in tmp_list if abs(val - list_values[n]) < grp_tolerance) == 0:
            tmp_val = sum(list_values[n] for val in list_values if abs(val - list_values[n]) < grp_tolerance) / \
                      sum(1 for val in list_values if abs(val - list_values[n]) < grp_tolerance)
            tmp_list.append(int(round(tmp_val)))
    return tmp_list
########################################################################################################################
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, quotechar='"', quoting=csv.QUOTE_ALL, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
########################################################################################################################
# In case a connection to the database can't be created, set 'conn' to 'None'
conn = None
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Define variables for use later on
#_______________________________________________________________________________________________________________________
sqlite_file = "pdf_table_text.sqlite" # Name of the sqlite database file
brk_tol = 3 # Tolerance for grouping LTRect values as line break points
# *** This may require tuning to get optimal results ***
cell_htol_lf = -2 # Horizontal & Vertical tolerances (up/down/left/right)
cell_htol_rt = 2 # for over-scanning table cell bounding boxes
cell_vtol_up = 8 # i.e., how far outside cell bounds to look for text to include
cell_vtol_dn = 0 # *** This may require tuning to get optimal results ***
replace_newlines = True # Switch for replacing newline codes (\n) with spaces
replace_multspaces = True # Switch for replacing multiple spaces with a single space
# txt_concat_str = "' '" # Concatenate cell data with a single space
txt_concat_str = "char(10)" # Concatenate cell data with a line feed
#=======================================================================================================================
# Default values for sample input & output files (path, filename, pagelist, etc.)
filepath = "" # Path of the source PDF file (default = current folder)
srcfile = "" # Name of the source PDF file (quit if left blank)
pagelist = [1, ] # Pages to extract table data (Make an interactive input?)
# --> THIS MUST BE IN THE FORM OF A LIST OR TUPLE!
#=======================================================================================================================
# Impose required conditions & abort execution if they're not met
# Should check if files are locked: sqlite database, input & output files, etc.
if filepath + srcfile == "" or pagelist == None:
    print "Source file not specified and/or page list is blank! Execution aborted!"
    sys.exit()
dmp_pdf_data = "pdf_data.csv"
dmp_tbl_data = "tbl_data.csv"
destfile = srcfile[:-3]+"csv"
#=======================================================================================================================
# First test to see if this file already exists & delete it if it does
if os.path.isfile(sqlite_file):
    os.remove(sqlite_file)
#=======================================================================================================================
try:
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Open or Create the SQLite database file
#___________________________________________________________________________________________________________________
print "-" * 120
print "Creating SQLite Database & working tables ..."
# Connecting to the database file
conn = sqlite3.connect(sqlite_file)
curs = conn.cursor()
qry_create_table = "CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)"
qry_alter_add_column = "ALTER TABLE {0} ADD COLUMN {1}"
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Create 1st Table
#___________________________________________________________________________________________________________________
    tbl_pdf_elements = "tbl_pdf_elements"   # Name of the 1st table to be created
    new_field = "idx"                       # Name of the index column
    field_type = "INTEGER"                  # Column data type
    # Delete the table if it exists so old data is cleared out
    curs.execute("DROP TABLE IF EXISTS " + tbl_pdf_elements)
    # Create output table for PDF text w/1 column (index) & set it as PRIMARY KEY
    curs.execute(qry_create_table.format(tn=tbl_pdf_elements, nf=new_field, ft=field_type))
    # Table fields: index, text_string, pg, x0, y0, x1, y1, orient
    cols = ("'pdf_text' TEXT",
            "'pg' INTEGER",
            "'x0' INTEGER",
            "'y0' INTEGER",
            "'x1' INTEGER",
            "'y1' INTEGER",
            "'orient' INTEGER")
    # Add other columns
    for col in cols:
        curs.execute(qry_alter_add_column.format(tbl_pdf_elements, col))
    # Committing changes to the database file
    conn.commit()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Create 2nd Table
#___________________________________________________________________________________________________________________
    tbl_table_data = "tbl_table_data"       # Name of the 2nd table to be created
    new_field = "idx"                       # Name of the index column
    field_type = "INTEGER"                  # Column data type
    # Delete the table if it exists so old data is cleared out
    curs.execute("DROP TABLE IF EXISTS " + tbl_table_data)
    # Create output table for Table Data w/1 column (index) & set it as PRIMARY KEY
    curs.execute(qry_create_table.format(tn=tbl_table_data, nf=new_field, ft=field_type))
    # Table fields: index, text_string, pg, row, column
    cols = ("'tbl_text' TEXT",
            "'pg' INTEGER",
            "'row' INTEGER",
            "'col' INTEGER")
    # Add other columns
    for col in cols:
        curs.execute(qry_alter_add_column.format(tbl_table_data, col))
    # Committing changes to the database file
    conn.commit()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Start PDF text extraction code here
#___________________________________________________________________________________________________________________
print "Opening PDF file & preparing for text extraction:"
print " -- " + filepath + srcfile
# Open a PDF file.
fp = open(filepath + srcfile, "rb")
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization (if needed)
# document = PDFDocument(parser, password)
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Extract text & location data from PDF file (examine & process only pages in the page list)
#___________________________________________________________________________________________________________________
    # Initialize variables
    idx1 = 0
    idx2 = 0
    lastpg = max(pagelist)
    print "Starting text extraction ..."
    qry_insert_pdf_txt = "INSERT INTO " + tbl_pdf_elements + " VALUES(?, ?, ?, ?, ?, ?, ?, ?)"
    qry_get_pdf_txt = "SELECT group_concat(pdf_text, " + txt_concat_str + \
                      ") FROM {0} WHERE pg=={1} AND x0>={2} AND x1<={3} AND y0>={4} AND y1<={5} ORDER BY y0 DESC, x0 ASC;"
    qry_insert_tbl_data = "INSERT INTO " + tbl_table_data + " VALUES(?, ?, ?, ?, ?)"
    # Process each page contained in the document.
    for i, page in enumerate(PDFPage.create_pages(document)):
        interpreter.process_page(page)
        # Get the LTPage object for the page.
        lt_objs = device.get_result()
        pg = device.pageno - 1      # Must subtract 1 to correct 'pageno'
        # Exit the loop if past last page to parse
        if pg > lastpg:
            break
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        # If it finds a page in the pagelist, process the contents
        if pg in pagelist:
            print "- Processing page {0} ...".format(pg)
            xbreaks = []
            ybreaks = []
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
            # Iterate thru list of pdf layout elements (LT* objects) then capture the text & attributes of each
            for lt_obj in lt_objs:
                # Examine LT objects & get parameters for text strings
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    # Increment index
                    idx1 += 1
                    # Assign PDF LTText object parameters to variables
                    pdftext = lt_obj.get_text()     # Need to convert escape codes & unicode characters!
                    pdftext = pdftext.strip()       # Remove leading & trailing whitespaces
                    # Save integer bounding box coordinates: round down @ start, round up @ end
                    # (x0, y0, x1, y1) = lt_obj.bbox
                    x0 = int(lt_obj.bbox[0])
                    y0 = int(lt_obj.bbox[1])
                    x1 = int(lt_obj.bbox[2] + 1)
                    y1 = int(lt_obj.bbox[3] + 1)
                    orient = 0      # What attribute gets this value?
                    # ---- These approaches don't work for identifying vertical text ... --------------------------------
                    # orient = lt_obj.rotate
                    # orient = lt_obj.char_disp
                    # if lt_obj.get_writing_mode == "tb-rl":
                    #     orient = 90
                    # if isinstance(lt_obj, LTTextBoxVertical):     # vs LTTextBoxHorizontal
                    #     orient = 90
                    # if LAParams(lt_obj).detect_vertical:
                    #     orient = 90
                    # ---------------------------------------------------------------------------------------------------
                    # Split text strings at line feeds
                    if "\n" in pdftext:
                        substrs = pdftext.split("\n")
                        lineheight = (y1 - y0) / (len(substrs) + 1)
                        # y1 = y0 + lineheight
                        y0 = y1 - lineheight
                        for substr in substrs:
                            substr = substr.strip()     # Remove leading & trailing whitespaces
                            if substr != "":
                                # Insert values into tuple for uploading into dB
                                pdf_txt_export = [(idx1, substr, pg, x0, y0, x1, y1, orient)]
                                # Insert values into dB
                                curs.executemany(qry_insert_pdf_txt, pdf_txt_export)
                                conn.commit()
                                idx1 += 1
                                # y0 = y1
                                # y1 = y0 + lineheight
                                y1 = y0
                                y0 = y1 - lineheight
                    else:
                        # Insert values into tuple for uploading into dB
                        pdf_txt_export = [(idx1, pdftext, pg, x0, y0, x1, y1, orient)]
                        # Insert values into dB
                        curs.executemany(qry_insert_pdf_txt, pdf_txt_export)
                        conn.commit()
                elif isinstance(lt_obj, LTLine):
                    # LTLine - Lines drawn to define tables
                    pass
                elif isinstance(lt_obj, LTRect):
                    # LTRect - Borders drawn to define tables
                    # Grab the lt_obj.bbox values
                    x0 = round(lt_obj.bbox[0], 2)
                    y0 = round(lt_obj.bbox[1], 2)
                    x1 = round(lt_obj.bbox[2], 2)
                    y1 = round(lt_obj.bbox[3], 2)
                    xmid = round((x0 + x1) / 2, 2)
                    ymid = round((y0 + y1) / 2, 2)
                    # rectline = lt_obj.linewidth
                    # If width less than tolerance, assume it's used as a vertical line
                    if (x1 - x0) < brk_tol:     # Vertical Line or Corner
                        xbreaks = add_new_value(xmid, xbreaks)
                    # If height less than tolerance, assume it's used as a horizontal line
                    if (y1 - y0) < brk_tol:     # Horizontal Line or Corner
                        ybreaks = add_new_value(ymid, ybreaks)
                elif isinstance(lt_obj, LTImage):
                    # An image, so do nothing
                    pass
                elif isinstance(lt_obj, LTFigure):
                    # LTFigure objects are containers for other LT* objects which shouldn't matter, so do nothing
                    pass
            col_breaks = condense_list(xbreaks, brk_tol)    # Group similar values & eliminate duplicates
            row_breaks = condense_list(ybreaks, brk_tol)
            col_breaks.sort()
            row_breaks.sort()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
            # Regroup the text into table 'cells'
            # ___________________________________________________________________________________________________
            print " -- Text extraction complete. Grouping data for table ..."
            row_break_prev = 0
            col_break_prev = 0
            table_data = []
            table_rows = len(row_breaks)
            for i, row_break in enumerate(row_breaks):
                if row_break_prev == 0:     # Skip the rest the first time thru
                    row_break_prev = row_break
                else:
                    for j, col_break in enumerate(col_breaks):
                        if col_break_prev == 0:     # Skip query the first time thru
                            col_break_prev = col_break
                        else:
                            # Run query to get all text within cell lines (+/- htol & vtol values)
                            curs.execute(qry_get_pdf_txt.format(tbl_pdf_elements, pg, col_break_prev + cell_htol_lf,
                                                                col_break + cell_htol_rt, row_break_prev + cell_vtol_dn,
                                                                row_break + cell_vtol_up))
                            rows = curs.fetchall()      # Retrieve all rows
                            for row in rows:
                                if row[0] != None:      # Skip null results
                                    idx2 += 1
                                    table_text = row[0]
                                    if replace_newlines:        # Option - Replace newline codes (\n) with spaces
                                        table_text = table_text.replace("\n", " ")
                                    if replace_multspaces:      # Option - Replace multiple spaces w/single space
                                        table_text = re.sub(" +", " ", table_text)
                                    table_data.append([idx2, table_text, pg, table_rows - i, j])
                            col_break_prev = col_break
                    row_break_prev = row_break
            curs.executemany(qry_insert_tbl_data, table_data)
            conn.commit()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Export the regrouped table data:
# Determine the number of columns needed for the output file
# -- Should the data be extracted all at once or one page at a time?
print "Saving exported table data ..."
qry_col_count = "SELECT MIN([col]) AS colmin, MAX([col]) AS colmax, MIN([row]) AS rowmin, MAX([row]) AS rowmax, " + \
"COUNT([row]) AS rowttl FROM [{0}] WHERE [pg] = {1} AND [tbl_text]!=' ';"
qry_sql_export = "SELECT * FROM [{0}] WHERE [pg] = {1} AND [row] = {2} AND [tbl_text]!=' ' ORDER BY [col];"
f = open(filepath + destfile, "wb")
writer = UnicodeWriter(f)
for pg in pagelist:
curs.execute(qry_col_count.format(tbl_table_data, pg))
rows = curs.fetchall()
if len(rows) > 1:
print "Error retrieving row & column counts! More that one record returned!"
print " -- ", qry_col_count.format(tbl_table_data, pg)
print rows
sys.exit()
for row in rows:
(col_min, col_max, row_min, row_max, row_ttl) = row
# Insert a page separator
writer.writerow(["Data for Page {0}:".format(pg), ])
if row_ttl == 0:
writer.writerow(["Unable to export text from PDF file. No table structure found.", ])
else:
k = 0
for j in range(row_min, row_max + 1):
curs.execute(qry_sql_export.format(tbl_table_data, pg, j))
rows = curs.fetchall()
if rows == None: # No records match the given criteria
pass
else:
i = 1
k += 1
column_data = [k, ] # 1st column as an Index
for row in rows:
(idx, tbl_text, pg_num, row_num, col_num) = row
if pg_num != pg: # Exit the loop if Page # doesn't match
break
while i < col_num:
column_data.append("")
i += 1
if i >= col_num or i == col_max: break
column_data.append(unicode(tbl_text))
i += 1
writer.writerow(column_data)
f.close()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Dump the SQLite regrouped data (for error checking):
print "Dumping SQLite table of regrouped (table) text ..."
qry_sql_export = "SELECT * FROM [{0}] WHERE [tbl_text]!=' ' ORDER BY [pg], [row], [col];"
curs.execute(qry_sql_export.format(tbl_table_data))
rows = curs.fetchall()
# Output data with Unicode intact as CSV
with open(dmp_tbl_data, "wb") as f:
writer = UnicodeWriter(f)
writer.writerow(["idx", "tbl_text", "pg", "row", "col"])
writer.writerows(rows)
f.close()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Dump the SQLite temporary PDF text data (for error checking):
print "Dumping SQLite table of extracted PDF text ..."
qry_sql_export = "SELECT * FROM [{0}] WHERE [pdf_text]!=' ' ORDER BY pg, y0 DESC, x0 ASC;"
curs.execute(qry_sql_export.format(tbl_pdf_elements))
rows = curs.fetchall()
# Output data with Unicode intact as CSV
with open(dmp_pdf_data, "wb") as f:
writer = UnicodeWriter(f)
writer.writerow(["idx", "pdf_text", "pg", "x0", "y0", "x1", "y2", "orient"])
writer.writerows(rows)
f.close()
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
print "Conversion complete."
print "-" * 120
except sqlite3.Error, e:
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Rollback the last database transaction if the connection fails
#___________________________________________________________________________________________________________________
    if conn:
        conn.rollback()
    print "Error '{0}':".format(e.args[0])
    sys.exit(1)
finally:
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Close the connection to the database file
#___________________________________________________________________________________________________________________
    if conn:
        conn.close()
