Writing Python output as xlsx - python

I want to execute the same function (which gives outputs A, B, C, D) for all the files (libraries) available in the given path. I am trying to write the output (A, B, C, D) into four different columns of a sheet in an xlsx file. Also, each sheet name of the xlsx should be the same as the respective file available in the path.
I have written the following code:
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty xlsx workbook, then add one sheet per .txt
    library found in *file_path* (sheet creation delegated to main()).

    xlsx_name -- name of the xlsx file to create
    file_path -- directory containing the .txt library files
    """
    # Create the (initially empty) workbook so that main() can re-open
    # it and append sheets to it.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # Bug fix: os.chdir() returns None, so its result must not be
    # assigned back to file_path; only the side effect is wanted.
    os.chdir(file_path)
    for library in libraries:
        # One sheet per .txt file, named after the file.
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Add a sheet named *library* to the workbook *xlsx_name* and fill
    it with the four space-separated columns of the library file.

    library   -- file name of a .txt library in the current directory
    xlsx_name -- path of an existing xlsx workbook to extend
    """
    workbook = openpyxl.load_workbook(xlsx_name)
    # New worksheet named after the library file, inserted first.
    sheet = workbook.create_sheet(library, 0)

    # Column headers.
    sheet.cell(column=1, row=1, value='value_A')
    sheet.cell(column=2, row=1, value='value_B')
    sheet.cell(column=3, row=1, value='value_C')
    sheet.cell(column=4, row=1, value='value_D')

    # Bug fix: the original reused the name *library* for the file
    # object ("with open(library) as library"), so the later
    # workbook[library] lookup indexed the workbook with a file handle.
    with open(library, 'r') as infile:
        # Data rows start at row 2, directly below the headers; an
        # explicit row counter avoids recomputing sheet.max_row for
        # every line.
        for row, line in enumerate(infile, start=2):
            # Split each line once instead of four times.
            A, B, C, D = line.split(' ')[:4]
            sheet.cell(column=1, row=row, value=str(A))
            sheet.cell(column=2, row=row, value=str(B))
            sheet.cell(column=3, row=row, value=str(C))
            sheet.cell(column=4, row=row, value=str(D))

    print(f'library {library} has been written at {os.getcwd()}')
    # Save once at the end; saving inside the loop is what made the
    # original unbearably slow on million-line files.
    workbook.save(xlsx_name)
This code works absolutely fine for me, but it is too slow at writing the xlsx file, as my path has hundreds of .txt libraries and each library has more than a million lines.
I could save the output (A, B, C, D) in .txt format and then build the xlsx file manually, but that is very laborious.
Is there any way to speed up this process? Or is any other fast xlsx writer available?
Any help will be appreciated.
Thanks

I have found a faster way to save my data into excel is :
Since I have output as a result of a for loop, first save the output(A,B,C,D) into a dictionary and then save into excel using pandas.
def create_xlsx_file(xlsx_name, file_path):
    """Create an empty xlsx workbook and populate it with one sheet
    for every .txt library in *file_path* (done by main()).

    xlsx_name -- name of the xlsx file to create
    file_path -- directory containing the .txt library files
    """
    # An empty workbook is created first so main() can append to it.
    workbook = xlsxwriter.Workbook(xlsx_name)
    workbook.close()

    libraries = os.listdir(file_path)
    # Bug fix: os.chdir() returns None -- do not assign its result
    # back to file_path; only the directory change is needed.
    os.chdir(file_path)
    for library in libraries:
        if library.endswith('.txt'):
            # Sheets are named after the library files.
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Read the four space-separated columns of *library* into lists
    and write them as a new sheet (named after the file) in
    *xlsx_name* via pandas.

    Collecting the values first and writing the sheet in one shot is
    far faster than cell-by-cell openpyxl writes.
    """
    dic = {'label_A': [], 'label_B': [], 'label_C': [], 'label_D': []}  # to store A,B,C,D values

    # Bug fix: keep *library* (the file name) distinct from the file
    # object -- the original shadowed it, so sheet_name below received
    # a file handle instead of a string.
    with open(library, 'r') as infile:
        for line in infile:
            # Split each line once instead of once per column.
            A, B, C, D = line.split(' ')[:4]
            dic['label_A'].append(A)
            dic['label_B'].append(B)
            dic['label_C'].append(C)
            dic['label_D'].append(D)

    df = pd.DataFrame(data=dic, columns=['label_A', 'label_B', 'label_C', 'label_D'])
    # Bug fix: df.to_excel(xlsx_name, ...) overwrites the whole file
    # on every call, keeping only the last library's sheet (and the
    # stale workbook.save at the end clobbered even that).  Appending
    # through an ExcelWriter preserves the previously written sheets.
    with pd.ExcelWriter(xlsx_name, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name=library)
    print(f'library {library} has been written at {os.getcwd()}')

Well, if i understand correctly, you have a TXT file in which each line has only 4 words.
That is why you are doing:
A=line.split(' ')[0]
B=line.split(' ')[1]
C=line.split(' ')[2]
D=line.split(' ')[3]
And if this is TRUE, you can just read each TXT file as a data-frame and use the column/row splitting technique and assign the values to the columns.
And then you can save the result as required in the CSV.
This way is much faster than normally looping. And even if you have uneven number of words in a line, you can still use this method and subset the first 4 columns only and that should also solve your problem.

in my experience pandas library is pretty fast to handle information and has a function to export data with xlsx format.
You could create an empty DataFrame
data = pd.DataFrame()
Save your rows in pd.Series. For example:
row = pd.Series(data=[A,B,C,D], index = ['value_A', 'value_B', 'value_C', 'value_D'])
index parameter are the names of your columns and data parameter the values on each row.
Add each row to the DataFrame
data = data.append(row, ignore_index = True)
And just export DataFrame to xlsx
data.to_excel("output.xlsx")
references
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
Don't forget to install the pandas library and import it into your program:
import pandas as pd
I hope it works for you.

From your code it seems that you don't need any formatting or function of xlsx, and if you only need to centralize your data, a simple csv will do with only tiny changes on your code
import csv
def create_xlsx_file(xlsx_name, file_path):
    """Start a fresh CSV output file with the header row, then append
    the contents of every .txt library in *file_path* (via main()).

    xlsx_name -- path of the CSV output file to (re)create
    file_path -- directory containing the .txt library files
    """
    # newline='' is required by the csv module; without it the writer
    # emits blank rows on Windows.
    with open(xlsx_name, 'w', encoding='UTF8', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(['value_A', 'value_B', 'value_C', 'value_D'])
    # The header file is closed before main() re-opens it in append
    # mode, so the writes cannot interleave.
    libraries = os.listdir(file_path)
    for library in libraries:
        if library.endswith('.txt'):
            main(library, xlsx_name)
def main(library, xlsx_name):
    """Append the four space-separated columns of every line in
    *library* to the CSV file *xlsx_name*.

    Bug fixes vs. the original: the def line was missing its colon,
    and read().splitlines() pulled the whole (possibly million-line)
    file into memory -- lines are now streamed lazily.
    """
    with open(xlsx_name, 'a', encoding='UTF8', newline='') as output_file:
        writer = csv.writer(output_file)
        with open(library, 'r', encoding='UTF8') as input_file:
            for line in input_file:
                # rstrip('\n') matches what splitlines() produced.
                A, B, C, D = line.rstrip('\n').split(' ')[:4]
                writer.writerow([A, B, C, D])
    print(f'library {library} has been written at {os.getcwd()}')

Related

How to merge multiple .xls files with hyperlinks in python?

I am trying to merge multiple .xls files that have many columns, but 1 column with hyperlinks. I try to do this with Python but keep running into unsolvable errors.
Just to be concise, the hyperlinks are hidden under a text section. The following ctrl-click hyperlink is an example of what I encounter in the .xls files: ES2866911 (T3).
In order to improve reproducibility, I have added .xls1 and .xls2 samples below.
xls1:
Title
Publication_Number
P_A
ES2866911 (T3)
P_B
EP3887362 (A1)
.xls2:
Title
Publication_Number
P_C
AR118706 (A2)
P_D
ES2867600 (T3)
Desired outcome:
Title
Publication_Number
P_A
ES2866911 (T3)
P_B
EP3887362 (A1)
P_C
AR118706 (A2)
P_D
ES2867600 (T3)
I am unable to get .xls file into Python without losing formatting or losing hyperlinks. In addition I am unable to convert .xls files to .xlsx. I have no possibility to acquire the .xls files in .xlsx format. Below I briefly summarize what I have tried:
1.) Reading with pandas was my first attempt. Easy to do, but all hyperlinks are lost in PD, furthermore all formatting from original file is lost.
2.) Reading .xls files with openpyxl.load
InvalidFileException: openpyxl does not support the old .xls file format, please use xlrd to read this file, or convert it to the more recent .xlsx file format.
3.) Converting .xls files to .xlsx
from xls2xlsx import XLS2XLSX
x2x = XLS2XLSX(input.file.xls)
wb = x2x.to_xlsx()
x2x.to_xlsx('output_file.xlsx')
TypeError: got invalid input value of type <class 'xml.etree.ElementTree.Element'>, expected string or Element
import pyexcel as p
p.save_book_as(file_name=input_file.xls, dest_file_name=export_file.xlsx)
TypeError: got invalid input value of type <class 'xml.etree.ElementTree.Element'>, expected string or Element
During handling of the above exception, another exception occurred:
StopIteration
4.) Even if we are able to read the .xls file with xlrd for example (meaning we will never be able to save the file as .xlsx, I can't even see the hyperlink:
import xlrd
wb = xlrd.open_workbook(file) # where vis.xls is your test file
ws = wb.sheet_by_name('Sheet1')
ws.cell(5, 1).value
'AR118706 (A2)' #Which is the name, not hyperlink
5.) I tried installing older versions of openpyxl==3.0.1 to overcome type error to no succes. I tried to open .xls file with openpyxl with xlrd engine, similar typerror "xml.entree.elementtree.element' error occured. I tried many ways to batch convert .xls files to .xlsx all with similar errors.
Obviously I can just open with excel and save as .xlsx but this defeats the entire purpose, and I can't do that for 100's of files.
You need to use xlrd library to read the hyperlinks properly, pandas to merge all data together and xlsxwriter to write the data properly.
Assuming all input files have same format, you can use below code.
# imports
import os
import xlrd
import xlsxwriter
import pandas as pd
# required functions
def load_excel_to_df(filepath, hyperlink_col):
    """Load an .xls file into a DataFrame and add a 'hyperlinks'
    column holding the URL behind each cell of *hyperlink_col*.

    filepath      -- path of the .xls file to read
    hyperlink_col -- name of the column whose cells carry hyperlinks
    """
    # xlrd exposes the hyperlinks, which pandas.read_excel discards.
    # Bug fix: the original passed the undefined name 'file_path'
    # instead of the parameter 'filepath'.
    book = xlrd.open_workbook(filepath)
    sheet = book.sheet_by_index(0)
    hyperlink_map = sheet.hyperlink_map
    data = pd.read_excel(filepath)
    hyperlink_col_index = list(data.columns).index(hyperlink_col)
    # hyperlink_map is keyed by (row, col) coordinates; keep only the
    # links belonging to the requested column.
    required_links = [v.url_or_path for k, v in hyperlink_map.items()
                      if k[1] == hyperlink_col_index]
    data['hyperlinks'] = required_links
    return data
# main code
# set required variables
input_data_dir = 'path/to/input/data/'
hyperlink_col = 'Publication_Number'
output_data_dir = 'path/to/output/data/'
output_filename = 'combined_data.xlsx'

# read and combine data
required_files = os.listdir(input_data_dir)
frames = []
for file in required_files:
    # Bug fix: the directory variable is input_data_dir; the original
    # referenced the undefined name 'data_dir'.
    curr_data = load_excel_to_df(input_data_dir + os.sep + file, hyperlink_col)
    frames.append(curr_data)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combined_data = pd.concat(frames, sort=False, ignore_index=True) if frames else pd.DataFrame()

cols = list(combined_data.columns)
m, n = combined_data.shape
hyperlink_col_index = cols.index(hyperlink_col)

# writing data
writer = pd.ExcelWriter(output_data_dir + os.sep + output_filename, engine='xlsxwriter')
# The last column contains the raw hyperlinks; write everything else
# first, leaving row 0 free for a manually formatted header.
combined_data[cols[:-1]].to_excel(writer, index=False, startrow=1, header=False)
workbook = writer.book
worksheet = writer.sheets[list(workbook.sheetnames.keys())[0]]
# Write the header row ourselves to avoid pandas' default header format.
for i, col in enumerate(cols[:-1]):
    worksheet.write(0, i, col)
# Re-write the hyperlink column as real links with display text.
for i in range(m):
    worksheet.write_url(i + 1, hyperlink_col_index,
                        combined_data.loc[i, cols[-1]],
                        string=combined_data.loc[i, hyperlink_col])
# Bug fix: writer.save() was deprecated and removed in pandas 2.0.
writer.close()
References:
reading hyperlinks - https://stackoverflow.com/a/7057076/17256762
pandas to_excel header formatting - Remove default formatting in header when converting pandas DataFrame to excel sheet
writing hyperlinks with xlsxwriter - https://xlsxwriter.readthedocs.io/example_hyperlink.html
Without a clear reproducible example, the problem is not clear. Assume I have two files called tmp.xls and tmp2.xls containing dummy data as in the two screenshots below.
Then pandas can easily, load, concatenate, and convert to .xlsx format without loss of hyperlinks. Here is some demo code and the resulting file:
import pandas as pd
# Read the two demo .xls files and stack their rows into one frame,
# then export the combined table in .xlsx format.
frames = [pd.read_excel(name) for name in ('tmp.xls', 'tmp2.xls')]
merged = pd.concat(frames, ignore_index=True)
merged.to_excel('./f3.xlsx')
Inspired by #Kunal, I managed to write code that avoids using Pandas libraries. .xls files are read by xlrd, and written to a new excel file by xlwt. Hyperlinks are maintened, and output file was saved as .xlsx format:
import os
import xlwt
from xlrd import open_workbook
# read and combine data
directory = "random_directory"
required_files = os.listdir(directory)

# Define new file and sheet to merge the input files into.
new_file = xlwt.Workbook(encoding='utf-8', style_compression=0)
new_sheet = new_file.add_sheet('Sheet1', cell_overwrite_ok=True)

# Initialize header row; can be done with any file since all inputs
# share the same layout.
old_file = open_workbook(directory + "/" + required_files[0], formatting_info=True)
old_sheet = old_file.sheet_by_index(0)
for column in range(old_sheet.ncols):
    new_sheet.write(0, column, old_sheet.cell(0, column).value)  # header row

# Add rows from all files present in the folder.
for file in required_files:
    old_file = open_workbook(directory + "/" + file, formatting_info=True)
    old_sheet = old_file.sheet_by_index(0)
    hyperlink_map = old_sheet.hyperlink_map  # map of all hyperlinks
    for row in range(1, old_sheet.nrows):  # all rows except the header
        # Stay within range on the lower side of hyperlink_map.items().
        if row - 1 < len(hyperlink_map.items()):
            # Current row count tells us where to append the new row.
            Row_depth = len(new_sheet._Worksheet__rows)
            for col in range(old_sheet.ncols):
                # Bug fix: the original used 'col is 1', which tests
                # object identity, not equality -- integer identity is
                # an implementation detail and a SyntaxWarning in
                # Python 3.8+.
                if col == 1:  # column 2 is the hyperlinked column
                    click = list(hyperlink_map.items())[row - 1][1].url_or_path
                    new_sheet.write(Row_depth, col, xlwt.Formula(
                        'HYPERLINK("{}", "{}")'.format(click, old_sheet.cell(row, 1).value)))
                else:  # not the hyperlinked column
                    new_sheet.write(Row_depth, col, old_sheet.cell(row, col).value)
new_file.save("random_directory/output_file.xlsx")
I assume the same as daedalus in terms of the excel files. Instead of pandas I use openpyxl to read and create a new excel file.
import openpyxl
wb1 = openpyxl.load_workbook('tmp.xlsx')
ws1 = wb1.get_sheet_by_name('Sheet1')  # bug fix: original used undefined 'wb'
wb2 = openpyxl.load_workbook('tmp2.xlsx')
ws2 = wb2.get_sheet_by_name('Sheet1')  # bug fix: original used undefined 'wb'
csvDict = {}
# Go through the first sheet to collect keys and hyperlinks.
# Bug fixes: 'for (row in ws1.max_row)' was not valid Python, and the
# dict was declared as csvDict but filled as hyperlink_dict.
# openpyxl rows are 1-based.
for row in range(1, ws1.max_row + 1):
    csvDict[ws1.cell(row=row, column=1).value] = [
        ws1.cell(row=row, column=2).hyperlink.target,
        ws1.cell(row=row, column=2).value,
    ]
# Go through the second sheet to collect keys and hyperlinks.
for row in range(1, ws2.max_row + 1):
    csvDict[ws2.cell(row=row, column=1).value] = [
        ws2.cell(row=row, column=2).hyperlink.target,
        ws2.cell(row=row, column=2).value,
    ]
Now you have all the data so you can create a new workbook and save the values from the dict into it via opnenpyxl.
# Bug fixes: 'true' is not a Python literal (True), Workbook must be
# qualified with the openpyxl module, and 'for irow in len(csvDict)'
# was not valid iteration.
wb = openpyxl.Workbook(write_only=True)
ws = wb.create_sheet()
# Write-only sheets only support append(); emit one row per entry.
for key, (target, text) in csvDict.items():
    ws.append([key, text, target])
wb.save('new_big_file.xlsx')
https://openpyxl.readthedocs.io/en/stable/optimized.html#write-only-mode

How to write a dictionary to a text file with columns for each element in the dictionary in Python

I am trying to use a previously generated Workspace from matlab to create a input text file for another program to read from. I can import the Workspace into Python with no issues as a dictionary. From here I am having difficulty writing to a text file due to the different data types and different sized arrays in the dictionary. I would like for each field in the dictionary to have its own column in the text file but have had little luck.
Here is a screen shot of the imported dictionary from matlab.
I would like the text file to have this format. (the header is not necessary)
I was able to get one of the variables into the text file with the following code but, I can't add more variables or ones with different data types.
import csv
import scipy.io
import numpy as np
#import json
#import _pickle as pickle
#import pandas as pd
# Load the MATLAB workspace; scipy returns it as a dict keyed by
# variable name.
mat = scipy.io.loadmat('Day055.mat')
CC1 = mat['CC1']
CC2 = mat['CC2']
DP1 = mat['DP1']
# Stack two of the variables for a combined export.
# NOTE(review): this assumes CC1 and DP1 have compatible shapes; the
# surrounding question says the arrays are differently sized, so this
# may need padding or truncation -- TODO confirm.
dat = np.array([CC1, DP1])
dat = dat.T
# Only one variable is exported for now; writing several side by side
# requires the arrays to share a length.
np.savetxt('tester.txt', CC1, delimiter='\t')
print("all done")
As you can see I have tried a few different ways as well but commented them out when I was not succesful.

Writing CSV row values to a PDF using Python

I've been using some great answers on Stack Overflow to help solve my problem, but I've hit a roadblock.
What I'm trying to do
Read values from rows of CSV
Write the values from the CSV to Unique PDFs
Work through all rows in the CSV file and write each row to a different unique PDF
What I have so far
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
import pandas as pd
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
# Read CSV into a pandas dataframe and assign columns as variables.
csv = '/myfilepath/test.csv'
df = pd.read_csv(csv)
Name = df['First Name'].values + " " + df['Last Name'].values
OrderID = df['Order Number'].values

packet = io.BytesIO()
# Create a new (overlay) PDF with Reportlab.
can = canvas.Canvas(packet, pagesize=letter)
can.setFont("Helvetica", 12)
if OrderID is not None:
    # str(array)[1:-1] strips the surrounding brackets of the repr.
    can.drawString(80, 655, '#' + str(OrderID)[1:-1])
can.setFont("Helvetica", 16)
if Name is not None:
    # Bug fix: the original line was missing its closing parenthesis.
    can.drawString(315, 630, str(Name)[2:-2])
can.save()

# Move to the beginning of the in-memory buffer before re-reading it.
packet.seek(0)
new_pdf = PdfFileReader(packet)
# Read the existing (template) PDF.
existing_pdf = PdfFileReader(open("Unique1.pdf", "rb"))
output = PdfFileWriter()
# Merge the generated overlay onto the existing page.
page = existing_pdf.getPage(0)
page2 = new_pdf.getPage(0)
page.mergePage(page2)
output.addPage(page)
# Finally, write "output" to a real file; the with-block guarantees
# the stream is closed even if write() raises.
with open("Output.pdf", "wb") as outputStream:
    output.write(outputStream)
The code above works if:
I specify the PDF that I want to write to
I specify the output file name
The CSV only has 1 row
What I need help with
Reading values from the CSV one row at a time and storing them as a variable to write
Select a unique PDF, and write the values from above, then save that file and select the next unique PDF
Loop through all rows in a CSV and end when the last row has been reached
Additional Info: the unique PDFs will be contained in a folder as they each have the same layout but different barcodes
Any help would be greatly appreciated!
I would personally suggest that you reconsider using Pandas and instead try the standard CSV module. It will meet your need for streaming through a file for row-by-row processing. Shown below is some code looping through a CSV file getting each row as a dictionary, and processing that in a write_pdf function, as well as logic that will get you a new filename to write the PDF to for each row.
import csv
# import the PDF libraries you need
def write_pdf(data, filename):
    """Write one PDF for a single CSV row.

    data     -- dict for one row, as produced by csv.DictReader
    filename -- output PDF path for this row
    """
    name = data['First Name'] + ' ' + data['Last Name']
    order_no = data['Order Number']
    # Leaving PDF writing to you

with open('file.csv', 'r') as f:
    reader = csv.DictReader(f)
    # enumerate() replaces the manual counter, and the f-string fixes
    # the original "'Output' + row_counter" TypeError (str + int).
    for row_counter, row in enumerate(reader):
        write_pdf(row, f'Output{row_counter}.pdf')
I'm going to leave the PDF writing to you because I think you understand what you need from that better than I do.
I known I cut out the Pandas part, but I think the issue are having with that, and how it doesn't work for a CSV with more than 1 row stems from DataFrame.get being an operation that retrieve an entire column.
Python CSV module docs
pandas DataFrame docs

Changing the format of cells in python 2.7 using xlsxwriter or xlwt

I am still relatively new to python and having difficulty with a little project. I am converting a csv file to xls file which I have accomplished, and now I am wanting to perform some calculations on some parts of the new xls file. All values are stored as Text, I am trying to convert the cells with numbers into numbers.
From my reading you cannot change cell type without erasing the data stored in the cell using xlsxwriter, is there a way to format the cells prior to writing the data? I have not found any way to convert cell type using xlwt.
I do not think conditional formatting will work in this case as I have not seen a way for it to detect a number and change the format of the cell.
My simple program just copies the csv file and creates a xls file, if there is a way to format the cells as they are being written that is great, if not I know the exact cells the data will be going into so I can format them prior to being written.
import csv, xlsxwriter
def makeXls(path, fileName):
    # Copy a ';'-delimited CSV file into a new Excel workbook, one cell
    # per field.  Python 2 style: the CSV is opened in binary mode,
    # which the py2 csv module expects; under Python 3 this would need
    # a text-mode handle with newline='' instead.
    #
    # path     -- path of the input CSV file
    # fileName -- output name without extension; '.xls' is appended,
    #             although xlsxwriter always writes xlsx-format content
    #
    # NOTE(review): the workbook is never closed here, so the output
    # may never be flushed to disk -- confirm a wb.close() happens.
    wb = xlsxwriter.Workbook(fileName+'.xls')
    ws = wb.add_worksheet('Report')
    with open(path, "rb") as f:
        # NUL bytes are stripped before parsing; presumably the source
        # CSVs contain embedded '\0' characters -- verify with caller.
        reader = csv.reader((line.replace('\0','') for line in f), delimiter=';')
        for r, row in enumerate(reader):
            for c, val in enumerate(row):
                # Values are written as-is, so numbers end up stored as
                # text -- the exact problem described in the question.
                ws.write(r,c,val)
The example I've posted below works with your existing code to show how you might use conditional formatting in excel via the xlsxwriter module (when a value is greater than or equal to zero in this example the font color is set to a dark red).
I made up a small test.csv that is written/read to/from the current directory and used to create test.xls. Also, please note that I removed the "rb" for reading in the .csv file that I made.
More information on working with conditional formats in the Xlsxwriter module can be found here
import pandas as pd
import numpy as np
import csv
import xlsxwriter
# This creates a sample .csv file used in my example.
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
df.to_csv('test.csv', sep=';')

def makeXls(path, fileName):
    """Copy a ';'-delimited CSV into an Excel file, converting numeric
    strings to numbers and highlighting values >= 0 via conditional
    formatting.

    path     -- input CSV path
    fileName -- output name without extension ('.xls' is appended)
    """
    # strings_to_numbers lets write() coerce numeric text with float(),
    # avoiding Excel's "Numbers Stored as Text" warning.
    wb = xlsxwriter.Workbook(fileName + '.xls', {'strings_to_numbers': True})
    format_1 = wb.add_format({'num_format': '#.########'})
    # Dark-red font applied by the conditional format below.
    format_2 = wb.add_format({'num_format': '#.########', 'font_color': '#9C0006'})
    ws = wb.add_worksheet('Report')
    with open(path) as f:
        reader = csv.reader((line.replace('\0', '') for line in f), delimiter=';')
        for r, row in enumerate(reader):
            for c, val in enumerate(row):
                ws.write(r, c, val, format_1)
    ws.conditional_format('B2:K50', {'type': 'cell', 'criteria': '>=', 'value': 0, 'format': format_2})
    ws.set_column(0, 3, 15)
    # Bug fix: without close() xlsxwriter never writes the file to disk.
    wb.close()

makeXls(path='test.csv', fileName='test')
Expected Output:
For best results with the xlsxwriter module, I would recommend using the .xlsx file format rather than the .xls.
All values are stored as Text, I am trying to convert the cells with numbers into numbers.
You can use the XlsxWriter strings_to_numbers constructor option:
wb = xlsxwriter.Workbook(fileName + '.xlsx', {'strings_to_numbers': True})
From the XlsxWriter docs:
strings_to_numbers: Enable the worksheet.write() method to convert strings to numbers, where possible, using float() in order to avoid an Excel warning about "Numbers Stored as Text".

How do I extract data from multiple text files to Excel using Python? (One file's data per sheet)

So far for my code to read from text files and export to Excel I have:
import glob
data = {}
# Collect every .txt file in the current directory: one dict entry per
# file, mapping filename -> list of its lines (trailing newline
# dropped via l[:-1]).
for infile in glob.glob("*.txt"):
    with open(infile) as inf:
        data[infile] = [l[:-1] for l in inf]

# Write the collected columns side by side, tab-separated, with the
# filenames as a header row.
# NOTE(review): despite the .xls extension this produces a plain
# tab-separated text file, which is why Excel reports "Invalid at the
# top level of the document"; zip() also silently truncates all
# columns to the shortest file's length.
with open("summary.xls", "w") as outf:
    outf.write("\t".join(data.keys()) + "\n")
    for sublst in zip(*data.values()):
        outf.write("\t".join(sublst) + "\n")
The goal with this was to reach all of the text files in a specific folder.
However, when I run it, Excel gives me an error saying,
"File cannot be opened because: Invalid at the top level of the document. Line 1, Position 1. outputgooderr.txt outputbaderr.txt. fixed_inv.txt
Note: outputgooderr.txt, outputbaderr.txt.,fixed_inv.txt are the names of the text files I wish to export to Excel, one file per sheet.
When I only have one file for the program to read, it is able to extract the data. Unfortunately, this is not what I would like since I have multiple files.
Please let me know of any ways I can combat this. I am very much so a beginner in programming in general and would appreciate any advice! Thank you.
If you're not opposed to having the outputted excel file as a .xlsx rather than .xls, I'd recommend making use of some of the features of Pandas. In particular pandas.read_csv() and DataFrame.to_excel()
I've provided a fully reproducible example of how you might go about doing this. Please note that I create 2 .txt files in the first 3 lines for the test.
import pandas as pd
import numpy as np
import glob
# Creating a dataframe and saving as test_1.txt/test_2.txt in the
# current directory; feel free to remove the next 3 lines if you want
# to test in your own directory.
df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
df.to_csv('test_1.txt', index=False)
df.to_csv('test_2.txt', index=False)

txt_list = []    # input file names ('...txt')
sheet_list = []  # sheet names (file names without the extension)
# Loop through filenames matching the .txt pattern in the current
# directory.
for infile in glob.glob("*.txt"):
    sheet_list.append(infile.replace('.txt', ''))  # excel sheet name
    txt_list.append(infile)

# Bug fix: writer.save() was deprecated and removed in pandas 2.0;
# using ExcelWriter as a context manager closes (and saves) it.
with pd.ExcelWriter('summary.xlsx', engine='xlsxwriter') as writer:
    # Iterate files and sheet names in lockstep instead of by index.
    for txt_name, sheet_name in zip(txt_list, sheet_list):
        df = pd.read_csv(txt_name)
        df.to_excel(writer, sheet_name=sheet_name, index=False)
Output example:

Categories

Resources