Reportlab balanced cols control split of flowables - python

I am using BalancedColumns to generate multiple column layout.
I am not sure, how to resolve an issue of the split happening of the Flowables across the column frames.
I have a heading and it's content. I don't want BalancedColumns to split the flowables in such a way that heading is part of one column and its content is part of another.
The content of the paragraph can split.
The basic python code:
from reportlab.platypus.flowables import BalancedColumns, CondPageBreak
from reportlab.platypus import BaseDocTemplate, Paragraph, Spacer, Frame, PageTemplate, PageBreak
from reportlab.lib.units import mm
from reportlab.lib.pagesizes import A4, LETTER
import os
from reportlab.lib.colors import HexColor
framewidth = A4[0] - 10*mm
frameheight = A4[1] - 20*mm
portrait_frame = Frame(5*mm, 10*mm, framewidth, frameheight,
leftPadding=0,
bottomPadding=0,
rightPadding=0,
topPadding=0,
id=0,
showBoundary=False )
pTemplate = PageTemplate(id=0,frames=[portrait_frame])
templates = [pTemplate]
pdfPath = os.path.abspath(os.path.join(os.path.dirname(__file__), 'balancedColTest.pdf'))
doc = BaseDocTemplate(pdfPath, pagesize=A4, rightMargin=2*mm, leftMargin=2*mm,topMargin=5*mm,bottomMargin=5*mm,showBoundary=0)
doc.addPageTemplates(templates)
# generate stories for balanced columns
story = []
minPadding = 2
for i in range(3):
fs = []
numOfFlowables = random.choice([2, 5, 6, 1, 3])
padding = minPadding + 5*i
for ii in range(numOfFlowables):
text = '<b> <font color="#77D179"> Heading </font></b> <br/>'
heading = Paragraph(text)
heading.keepWithNext = True
fs.append(heading)
fs.append(Paragraph("This is another text in new Para flowable."))
# numCols = 2 if i%2 == 0 else 3
numCols = 2
bCols = BalancedColumns(fs, nCols=numCols, spaceAfter=5*mm, vLinesStrokeColor=HexColor('#77D179'), vLinesStrokeWidth=0.5)
story.append(bCols)
doc.build(story)
Even if I try to use keepWithNext=True the flowables are not part of same frame.
If I use KeepTogether, the BalanceColumns takes the entire space of the page.
heading = Paragraph(text)
content = Paragraph("This is another text in the paragraph")
f= KeepTogether([heading, content])
fs.append(f)
Can anyone suggest a solution so that the heading and its underlying paragraph can remain in the same frame?

Related

How to use MagickImage/Wand to create an image comprised of equally spaced bordered boxes of text in Python

I need to make an image that looks like the following:
To do so, I've implemented the use of MagickImage/Wand. Here is my current implementation
import re
from unicodedata import normalize
from docx import Document
from wand.image import Image
from wand.drawing import Drawing
from wand.font import Font
doc = Document("P.docx")
docText = []
for para in doc.paragraphs:
docText.append(para.text)
fullText = "\n".join(docText)
ct = 242
def get(source, begin, end):
try:
start = source.index(len(begin)) + len(begin)
finish = source.index(len(end), len(start))
return source[start:finish]
except ValueError:
return ""
def capitalize(string):
cap = ("".join(j[0].upper() + j[1:]) for j in string)
return cap
def find_matches(text):
return capitalize(
[
m
for m in re.findall(
r"^[^0-9]\s+([^.;]+\s*)+[.;]+", normalize("NFKD", text), re.MULTILINE
)
]
)
with Image(width=300, height=300, psuedo='xc:black') as canvas:
left, top, width, height = 50, 10, 100, 150
for match in find_matches(text=fullText):
ct += 1
match_words = match.split(" ")
match = " ".join(match_words[:-1])
with Drawing() as context:
context.fill_color = 'white'
context.rectangle(left=left, top=top, width=width, height=height)
canvas.font = Font('/System/Library/Fonts/arial.ttf')
context(canvas)
canvas.caption(match + '\r' + 'ct', left=left, top=top, width=width, height=height, gravity='center')
canvas.save(filename='patdrawTest.png')
I'm not quite certain on how to create borders or how to properly space things with this tool, and as such, this is my current output:
I understand I need to have a base image that is iterated over. I also understand that I will need flags in order to keep track of the height/width/etc. of the previous blocks of text (unless there is an easier way of doing so with this tool). However, the way my code currently works is that it takes in words from a word document, parses it to get specific matches, and then is supposed to put it into an image like the first image I showed above. Yet, I am at a loss. Any help would be greatly appreciated.
Here's the code I've come up with in order to make equally-spaced boxes of text.
import re
from unicodedata import normalize
from docx import Document
from wand.image import Image
from wand.drawing import Drawing
from wand.font import Font
doc = Document("P.docx")
docText = []
for para in doc.paragraphs:
docText.append(para.text)
fullText = "\n".join(docText)
ct = 242
def get(source, begin, end):
try:
start = source.index(len(begin)) + len(begin)
finish = source.index(len(end), len(start))
return source[start:finish]
except ValueError:
return ""
def capitalize(string):
cap = ("".join(j[0].upper() + j[1:]) for j in string)
return cap
def find_matches(text):
return capitalize(
[
m
for m in re.findall(
r"^[^0-9]\s+([^.;]+\s*)+[.;]+", normalize("NFKD", text), re.MULTILINE
)
]
)
with Image(width=400, height=1000, pseudo='xc:white') as canvas:
left, top, width, height = 2, 2, 395, 131
for match in find_matches(text=fullText):
ct += 1
match_words = match.split(" ")
match = " ".join(match_words[:-1])
with Drawing() as context:
context.fill_color = 'black'
context.rectangle(left=left, top=top, width=width, height=height)
context.fill_color = 'white'
context.rectangle(left=(left+2), top=(top+2), width=(width-4), height=(height-4))
canvas.font = Font('/System/Library/Fonts/timesnewroman.ttf')
context(canvas)
canvas.caption(match + '\n' + str(ct), left=(left+5), top=top, width=(width-10), height=height,
gravity='center')
top += 135
canvas.crop(bottom=top)
canvas.save(filename='patdrawTest.png')
Here is the output with this code:
I do, however, still have something I'd like to address. While the boxes of text are all equally-spaced and look rather nice, I'd still prefer that all of the text looks the same; that is the same font-size, and the only way to do that is to have the borders and such be automatically re-sized such that it can work that way. I have no clue on how to do this, but for now here is this, should anyone else run into something like this.

Set data labels text frame wrap to false – python-pptx

I am asking a duplicate of this question, except that the answer submitted does not work for me. I would like to toggle the data_labels' "Wrap text in shape" button from the powerpoint UI via python-pptx. The linked answer ends up removing the data labels altogether instead. I am using the latest python-pptx version (0.6.18).
Here is a simple example to replicate:
from pptx import Presentation
from pptx.chart.data import ChartData
from pptx.enum.chart import XL_CHART_TYPE
from pptx.util import Cm
from pptx.text.text import TextFrame
# create presentation with 1 slide ------
prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[5])
x = ['one','two','three', 'four']
y = [['diff',
[1,
2,
3,
4,
]],
]
specs = {
'height': Cm(7.82),
'width': Cm(14.8),
'left': Cm(2.53),
'top': Cm(5.72)}
data = ChartData()
data.categories = x
data.add_series('diff', [j for j in y[0][1]])
frame = slide.shapes.add_chart(
XL_CHART_TYPE.BAR_CLUSTERED, specs['left'], specs['top'],
specs['width'], specs['height'], data
)
plot = frame.chart.plots[0]
plot.has_data_labels = True
data_labels = plot.series[0].data_labels
dLbls = data_labels._element
# ---use its <c:txPr> child to create TextFrame object---
text_frame = TextFrame(dLbls.get_or_add_txPr(), None)
# ---turn off word-wrap in the usual way---
text_frame.wrap = False
prs.save('chart-01.pptx')
I believe the second to last line should be text_frame.word_wrap = False, not .wrap; that's my mistake on the earlier answer (now fixed).
Also change this line:
data_labels = plot.series[0].data_labels
to:
data_labels = plot.data_labels
And I think you'll get what you're looking for.

Python use pptx read image files in folder place in ppt slide

Need some help.
I'm writing the code to get file name from List and use those file name to look in folder for get the images place on PowerPoint slide. The purpose is I would like to add three image on same slide. So, every slide will have 3 images and so on...
Let say...
Slide1 : place image file name aaaa-1.jpg, aaaa-2.jpg, aaaa-3.jpg
Slide2 : Place image file name bbbb-1.jpg, bbbb-2.jpg, bbbb-3.jpg
... and so on until end of data in list
The file name that keep in list look like this ..list =['aaaa-1.jpg', 'aaaa-2.jpg', 'aaaa-3.jpg', 'bbb-1.jpg, bbbb-2.jpg', 'bbbb-3.jpg' ...]
I use function to send files name to pptx creating module but it doesn't work. After run this code, they build 3 images (with same file) in one slide!
Could you kindly please advise.
Thanks for all answers from your guy.
Here is my code
from pptx import Presentation
from pptx.util import Inches
import os
prs = Presentation()
prs.slide_height=Inches(9)
prs.slide_width=Inches(16)
def buildafm(f1,f2,f3): #pass f1, f2, f3 to function
blank_slide_layout = prs.slide_layouts[6]
slide = prs.slides.add_slide(blank_slide_layout)
os.chdir(r"C:\Python38-32\faprojects\folders\hzt")
left = top = Inches(1)
height = Inches(3.5)
width = Inches(3.5)
pic = slide.shapes.add_picture(f1, left, top, width=width, height=height)
left = Inches(5)
top = Inches(1)
height = Inches(3.5)
width = Inches(3.5)
pic = slide.shapes.add_picture(f2, left, top, width=width, height=height)
left = Inches(9)
top = Inches(1)
height = Inches(3.5)
width = Inches(5.5)
pic = slide.shapes.add_picture(f3, left, top, width=width, height=height)
os.chdir(r"C:\Python38-32\faprojects\folders\hzt")
a = os.listdir(os.getcwd())
# Image files name in folder HZT
#a=['aaaaaaaaaaa.001.jpg','aaaaaaaaaaaa.002.jpg','bbbbbbbbbb.001.jpg','bbbbbbbbbb.002.jpg','cccccccccc.001.jpg','cccccccccc.002.jpg']
newList = [string[:10] for string in a]
print("here is new list",newList)
no_dupes = [x for n, x in enumerate(newList) if x not in newList[:n]]
print("here is new list",no_dupes)
print("Number of heads",len(no_dupes))
hd=no_dupes[0:1]
str1=""
hd=str1.join(hd)
print(hd)
res = list(filter(lambda x: hd in x, a))
print("file is",res)
'''f1 = res[0:1]
f2 = res[1:2]
f3 = res[2:3]'''
for t in res:
print(t)
#showdata(t)
buildafm(t,t,t) #call function to create pptx
prs.save('testbat.pptx')
os.startfile("testbat.pptx")

Trying to parse Word Documents and getting PdfReadError: EOF marker not found

I am testing some Python code to loop through resumes, open each, parse each, and create a comprehensive report based on the contents of each resume. Here is the code that I am running.
#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher
#Function to read resumes from the folder one by one
mypath='C:\\path_to_resumes\\' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
def pdfextract(file):
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
countpage = fileReader.getNumPages()
count = 0
text = []
while count < countpage:
pageObj = fileReader.getPage(count)
count +=1
t = pageObj.extractText()
print (t)
text.append(t)
return text
#function to read resume ends
#function that does phrase matching and builds a candidate profile
def create_profile(file):
text = pdfextract(file)
text = str(text)
text = text.replace("\\n", "")
text = text.lower()
#below is the csv where we have all the keywords, you can customize your own
keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]
matcher = PhraseMatcher(nlp.vocab)
matcher.add('Stats', None, *stats_words)
matcher.add('NLP', None, *NLP_words)
matcher.add('ML', None, *ML_words)
matcher.add('DL', None, *DL_words)
matcher.add('R', None, *R_words)
matcher.add('Python', None, *python_words)
matcher.add('DE', None, *Data_Engineering_words)
doc = nlp(text)
d = []
matches = matcher(doc)
for match_id, start, end in matches:
rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
span = doc[start : end] # get the matched slice of the doc
d.append((rule_id, span.text))
keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
## convertimg string of keywords to dataframe
df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1)
df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
base = os.path.basename(file)
filename = os.path.splitext(base)[0]
name = filename.split('_')
name2 = name[0]
name2 = name2.lower()
## converting str to dataframe
name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
return(dataf)
#function ends
#code to execute/call the above functions
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
#code to count words under each category and visulaize it through Matplotlib
final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace = True)
final_database2.fillna(0,inplace=True)
new_data = final_database2.iloc[:,1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25,7), stacked=True)
labels = []
for j in new_data.columns:
for i in new_data.index:
label = str(j)+": " + str(new_data.loc[i][j])
labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
width = rect.get_width()
if width > 0:
x = rect.get_x()
y = rect.get_y()
height = rect.get_height()
ax.text(x + width/2., y + height/2., label, ha='center', va='center')
plt.show()
In the folder, I have '.doc' and '.docx' files. Everything seems to work fine, up until this point, directly below. When I get here, the code throws an error. Here is the troublesome code. The weird thing is, that it looks like some kind of PDF error, but I'm iterating only through '.doc' and '.docx' files.
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
Here is the StackTrace:
Traceback (most recent call last):
File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
dat = create_profile(file)
File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
text = pdfextract(file)
File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
raise utils.PdfReadError("EOF marker not found")
PdfReadError: EOF marker not found
The code comes from here.
https://towardsdatascience.com/do-the-keywords-in-your-resume-aptly-represent-what-type-of-data-scientist-you-are-59134105ba0d
You are using package PyPDF2, which is used to read and manipulate pdf files. In the article from towardsdatascience that you mentioned all resumes that author was working on were in pdf format.
Maybe if your resumes are in doc/docx format you should explore python-docx library:
https://python-docx.readthedocs.io/en/latest/index.html

Generating a PDF Index with ReportLab

I'm generating PDF files through ReportLab, but i can't find any documentation on how to generate an index linking to the file pages. Does ReportLab support this type of feature, or is there any other solution ?
After a lot of time searching i came up with an anchor solution. Although i feel it was not the perfect solution for me, i hope it helps someone in need.
from reportlab.lib.styles import ParagraphStyle
from reportlab.pdfbase.pdfmetrics import registerFont
from reportlab.platypus import Paragraph, PageBreak, SimpleDocTemplate, Spacer
registerFont(TTFont('Calibri', 'Calibri.ttf')) # Just some font imports
registerFont(TTFont('Calibri-Bold', 'calibrib.ttf'))
pH = ParagraphStyle(name = 'Header', fontName = 'Calibri-Bold', fontSize = 13, leftIndent = 20, firstLineIndent = -20, spaceBefore = 10, leading = 16)
sH = ParagraphStyle(name = 'SubHeader', fontName = 'Calibri', fontSize = 12, leftIndent = 40, firstLineIndent = -20, spaceBefore = 5, leading = 16)
doc = SimpleDocTemplate('Reports\\PDFname.pdf')
story = [Spacer(1, 2 * inch)]
story.append(Paragraph('<a href = page3.html#0>1. First Title</a>', pH)) # Linking the anchor to reference 0
story.append(Paragraph('<a href = page3.html#1>1.1. First Subtitle</a>', sH)) # Linking the anchor to reference 1
story.append(PageBreak())
story.append(Paragraph('<a name = page3.html#0></a> 1. First Title', pH)) # Creating anchor with reference 0
story.append(Paragraph('<a name = page3.html#1></a><br/> 1.1. First Subtitle', style)) # Creating anchor with reference 1
doc.build(story)

Categories

Resources