How to wrap text in Excel using Python?

I created a script which parses YAML files to Excel. I've written the part of the code that adjusts the column widths, but now I want to wrap the text in the Description column.
After script execution: (screenshot omitted)
What I want: (screenshot omitted)
Here is my code:
import yaml
import os
import pandas as pd
from openpyxl.utils import get_column_letter
import sys

def yaml_to_excel(dictionary, row):
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from yaml_to_excel(value, row + 1)
        elif type(value) is list:
            for item in value:
                yield from yaml_to_excel(item, row + 1)
                row += 1
        else:
            yield (key, value, row)

def get_all_subdirectiries_in_path(root_dir):
    all_subdirectories = []
    for file in os.listdir(root_dir):
        d = os.path.join(root_dir, file)
        if os.path.isdir(d):
            all_subdirectories.append(d)
    return all_subdirectories

def dfs_tabs(df_list, sheet_list, file_name, columns):
    writer = pd.ExcelWriter(file_name, engine='openpyxl')
    for dataframe, sheet, columns_list in zip(df_list, sheet_list, columns):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0, index=False, columns=columns_list)
        for column in dataframe:
            column_width = max(dataframe[column].astype(str).map(len).max() + 5, len(column) + 5)
            col_letter = get_column_letter(dataframe.columns.get_loc(column) + 1)
            writer.sheets[sheet].column_dimensions[col_letter].width = column_width
    writer.save()

def get_nested_dir_path(main_dir):
    if any(File.endswith(".yml") for File in os.listdir(main_dir)):
        yield main_dir
    else:
        for file in os.listdir(main_dir):
            yield from get_nested_dir_path(main_dir + f"/{file}")

def main(dir_path):
    all_sheets = []
    sheet_names = []
    all_columns = []
    for subdirectory in get_nested_dir_path(dir_path):
        final_object = []
        last_row = 1
        current_row = 1
        for file in os.listdir(subdirectory):
            # Check if a YAML file exists
            if (file.split(".")[1] == "yml") | (file.split(".")[1] == "yaml"):
                with open(subdirectory + f"/{file}", "r", encoding="utf-8") as f:
                    dataMap = yaml.safe_load(f)
                last_row += 1
                hierarchy_obj = {}
                hierarchy = 1
                prev_value = ""
                prev_temp_depth = 2
                current_row += 1
                starting_row = current_row
                for key, value, temp_depth in yaml_to_excel(dataMap, last_row):
                    if key not in hierarchy_obj:
                        hierarchy_obj[key] = hierarchy
                        hierarchy += 1
                    if prev_value != "":
                        if hierarchy_obj[key] < hierarchy_obj[prev_value]:
                            current_row += 1
                        if (temp_depth > prev_temp_depth) & (
                            hierarchy_obj[key] > hierarchy_obj[prev_value]
                        ):
                            current_row += 1
                    if type(value) is bool:
                        if value:
                            value = 'yes'
                        else:
                            value = 'no'
                    final_object.append(
                        {
                            "column": hierarchy_obj[key],
                            "value": value,
                            "row": current_row,
                        }
                    )
                    prev_value = key
                    prev_temp_depth = temp_depth
        columns_arr = []
        for column_name in hierarchy_obj.keys():
            columns_arr.append(column_name)
        all_columns.append(columns_arr)
        last_row_in_excel = final_object[len(final_object) - 1]['row']
        main_arr = []
        for i in range(last_row_in_excel):
            main_arr.append(["" for i in range(len(columns_arr))])
        main_arr = [main_arr]
        for item in final_object:
            main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
        result = pd.DataFrame(main_arr[0], columns=columns_arr)
        result = result[1:]
        all_sheets.append(result)
        sheet_names.append(subdirectory[subdirectory.rfind('/') + 1:].replace("/", " ").replace("_", " ")[0:31])
    dfs_tabs(all_sheets, sheet_names, 'test.xlsx', all_columns)

if __name__ == "__main__":
    dir_path = 'Lands'
    main(dir_path)
And below you can find the YAML file I base it on:
Country: France
Capital City: Paris
Language: French
Description: |-
  A country whose metropolitan part is in Western Europe, and also has overseas territories on other continents.
The most important people:
  President: Emmanuel Macron
  Prime Minister: Élisabeth Borne
  Year: 2020
Population:
  - Total population: 67 522 000
    Year: 2021
Nationality:
  - French population: 87%
    German population: 2%
    Year: 2022
Paris neighborhoods:
  - 1st district of Paris: Louvre
    Paris II district: Bourse
    Year: 2023
Additional information:
To run my script you must create the folder structure Lands/France and place the file france.yml inside.
I will be grateful for any tips on how to wrap text in the Description column!
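One possible approach that stays within openpyxl (the engine the script already uses) is to post-process the generated file and set Alignment(wrap_text=True) on every cell of the Description column. This is a minimal sketch; the column letter 'D' is an assumption that depends on where Description lands in your sheet:
from openpyxl import load_workbook
from openpyxl.styles import Alignment

wb = load_workbook('test.xlsx')  # the file produced by the script above
for ws in wb.worksheets:
    for cell in ws['D']:         # 'D' is assumed to be the Description column
        cell.alignment = Alignment(wrap_text=True)
wb.save('test.xlsx')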

In this particular situation I suggest parsing the YAML file with pandas.json_normalize and using the format class of XlsxWriter to wrap the text in the Description column.
Try this:
import pandas as pd
from yaml import safe_load

with open(r"Lands\France\france.yml", 'r', encoding='utf-8') as f:
    df = pd.json_normalize(safe_load(f))

# --- Selecting columns up to and including Description
description_index = df.columns.get_loc("Description")
df = df.iloc[:, 0:description_index + 1]

# engine='xlsxwriter' is required for workbook.add_format below
with pd.ExcelWriter('final_result.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    # --- Wrapping the text of the Description column
    wrap_format = workbook.add_format({'text_wrap': True})
    # --- Auto-fit all the columns, re-applying the wrap format on the
    #     Description column so the width call does not overwrite it
    for column in df:
        column_width = max(df[column].astype(str).map(len).max(), len(column))
        col_idx = df.columns.get_loc(column)
        cell_format = wrap_format if col_idx == description_index else None
        worksheet.set_column(col_idx, col_idx, column_width, cell_format)
Output (in Excel): (screenshot omitted)

Related

Transform lines from .txt file into .xls columns pandas

I have a text file that contains data on lines like this:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
I want to process this data and export it into an Excel file as:
TAG                        Size
/Tmobile_Cro/TmobileCro    7868.0 Mb
/XpressCM/SX-BASE          25265.7 Mb
This is my code:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os

def clearcasesize():
    pathdest = path_dir()
    dest = createdir()
    txtName = rf"{pathdest}\vobs_size.txt"
    workBook = xlwt.Workbook(encoding='ascii')
    workSheet = workBook.add_sheet('sheet1')
    fp = open(txtName, 'r+b')
    workSheet.write(0, 0, "TAG")
    workSheet.write(0, 1, "Size")
    row = 0
    entries = 0
    fullentry = []
    for linea in fp.readlines():
        str_linea = linea.decode('gb2312', 'ignore')
        str_linea = str_linea[:-2]  # str string
        txt = str_linea
        arr = str_linea
        if arr[:9] == "Total":
            txt = arr
            entries += 1
            s = txt.index("/")
            e = txt.index('"', s)
            txt = txt[s:e]
            fullentry.append(txt)
        elif arr.find("is") >= 0:
            entries += 1
            txt = arr
            s = txt.index("is")
            txt1 = txt[s + 7:20]
            fullentry.append(txt1)
            if (row == 65536):
                break
    finalarr = []
    finalarr1 = []
    temp = 0
    row = 1
    for r in fullentry:
        finalarr.append(r)
        temp += 1
        if temp == 12:
            finalarr1.append(finalarr)
            temp = 0
            col = 0
            for arr in finalarr:
                workSheet.write(row, col, arr)
                col += 1
            row += 1
            finalarr.clear()
            if (row == 65536):
                break
    workBook.save(os.path.join(dest, "ClearcaseSize.xls"))
    fp.close()

clearcasesize()
The code should work, but it's only creating the column names "TAG" and "Size".
The idea of my script is that it locates lines starting with "Total", puts the value it finds between the double quotes into the first column, and then uses the "is" match to add the size to the second column, but it's not working...
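As a hedged sketch of that idea, the tag and size can be pulled out of each line with a single regular expression; the pattern below assumes the exact line shape from the sample:
import re

line = 'Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb'
match = re.search(r'vob "(?P<tag>[^"]+)" is (?P<size>.+)$', line)
if match:
    print(match.group("tag"), match.group("size"))  # /Tmobile_Cro/TMobileCro 7868.0 Mb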
Based on your short sample:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
Why don't you try pandas like this:
import pandas as pd

with open("sample_file.txt") as f:
    lines = [
        line.strip().split(" vob ")[-1].replace('"', "").split(" is ")
        for line in f.readlines()
    ]

df = pd.DataFrame(
    [l for l in lines if not l[0].startswith("/vobs")],
    columns=["TAG", "Size"],
)
df.to_excel("sample_file.xlsx", index=False)
I don't have Excel, but you should get an .xlsx file that looks like this:
TAG                        Size
/Tmobile_Cro/TMobileCro    7868.0 Mb
/XpressCM/SX-BASE          25265.7 Mb

How can I plot the monthly average temperature?

I wrote this code in order to obtain a series of monthly weather observations at Helsinki for the period from 1960 to 2020, and then I saved the data to a local file using the pickle package. I used the data available from the API provided by the Finnish Meteorological Institute.
import datetime
from pprint import pprint
import pickle
import pandas as pd

class FmiApi:
    """
    a minimalistic wrapper for the Finnish Meteorological Institute API
    """
    def __init__(self):
        self.base_url = 'http://opendata.fmi.fi/wfs'
        self.fixed_params = {
            'service': 'WFS',
            'version': '2.0.0',
        }
        self.namespaces = {
            'wfs': 'http://www.opengis.net/wfs/2.0',
            'gml': 'http://www.opengis.net/gml/3.2',
            'BsWfs': 'http://xml.fmi.fi/schema/wfs/2.0'
        }

try:
    with open('wdata.pkl', 'rb') as f:
        data = pickle.load(f)
except:
    wheaters = []
    for year in range(1960, 2021):
        for month in range(1, 13):
            api = FmiApi()
            w = api.get_monthly_obs('helsinki', year, month)
            wheaters.append(w)
            pprint(w)
    with open('wdata.pkl', 'wb') as f:
        pickle.dump(wheaters, f)
Now I want to use the local file to access the data in order to plot the monthly average temperature for the years 1960 to 2020.
I wrote this code but it doesn't print the average temperature:
def get_data(file_in=None, file_out=None):
    assert file_in != None or file_out != None
    # read wdata from local file
    df_weath = pd.read_csv('wdata.pkl', parse_dates=[0], infer_datetime_format=True)
    df_weath.sort_values('Date', inplace=True, ignore_index=True)
    df_weath['Date'] = df_weath['Date'].dt.date  # convert to datetime objects
    #print(df_wdata)
    #input()
    # get weather data in the format returned by the api
    if file_in != None:  # read weather data from local file
        with open(file_in, 'rb') as f:
            wdata = pickle.load(f)
    else:  # use the FMI API to get weather data
        api = FmiApi()  # instantiate the api
        params = {
            'place': u'helsinki',
            'maxlocations': '5',
        }
        d0 = df_weath['date'].values[0]
        d1 = df_weath['date'].values[-1]
        n_days = (d1 - d0).days + 1  # number of days between d0 and d1
        wdata = []
        for i_day in range(n_days):
            date = d0 + datetime.timedelta(days=i_day)
            params['starttime'] = str(date) + 'T00:00:00'
            params['endtime'] = str(date) + 'T00:00:00'
            try:
                print('requesting weather data for %s' % str(date))
                weather = api.get_daily_obs(params)
            except:
                print('getting weather failed, skipping')
                continue
            wdata.append(weather)
        if file_out != None:  # save weather data to a file
            with open(file_out, 'wb') as f:
                pickle.dump(wdata, f)
    # move weather data to a pandas dataframe (calculate avg over valid observations)
    data = []
    variables = ['tday']
    for wobs in wdata:
        avgs = {}
        for pos, obs in wobs.items():
            for var, xx in obs.items():
                if not var in variables:
                    continue
                sdate = xx[1][:10]
                date = datetime.date(*[int(z) for z in sdate.split('-')])
                if xx[0] != 'NaN':
                    val = float(xx[0])
                else:
                    val = None
                if not var in avgs:
                    avgs[var] = []
                if val != None:
                    avgs[var].append(val)
        vals = []
        for var in variables:  # calculate the average when available
            if len(avgs[var]) > 0:
                vals.append(sum(avgs[var]) / len(avgs[var]))
            else:
                vals.append(None)
        row = [date] + vals
        data.append(row)
    df_weather = pd.DataFrame(data, columns=['date'] + variables)
    print(df_weather)
    input()
    # merge the dataframes by date (pick dates from weather df and add NaNs for missing values)
    df = pd.merge(df_weather, df_weath, on='date', how='left', sort=True)
    return df
The dictionary is like this:
{(65.90051, 29.06302): {'rrmon': ('18.4', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')},
(65.91718, 29.22969): {'rrmon': ('15.0', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')},
(65.95515, 29.1347): {'rrmon': ('16.9', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')}}
Can you help me find what I am doing wrong? Or do you know any easier way to plot the monthly average temperature?
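For what it's worth, here is a minimal sketch of the plotting step, assuming the pickled list has exactly the shape shown above (one dict per month, with a ('value', timestamp) tuple under 'tmon' per station) and that matplotlib is available:
import pickle
import pandas as pd
import matplotlib.pyplot as plt

with open('wdata.pkl', 'rb') as f:
    months = pickle.load(f)  # one dict per month, keyed by station coordinates

rows = []
for wobs in months:
    temps, month = [], None
    for obs in wobs.values():
        val, stamp = obs['tmon']  # value comes as a string, 'NaN' when missing
        month = stamp[:7]         # keep 'YYYY-MM'
        if val != 'NaN':
            temps.append(float(val))
    if month is not None and temps:
        rows.append((month, sum(temps) / len(temps)))  # average over stations

df = pd.DataFrame(rows, columns=['month', 'tmon'])
df['month'] = pd.to_datetime(df['month'])
df.plot(x='month', y='tmon', legend=False)
plt.ylabel('monthly average temperature')
plt.show()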

Making Excel histograms using python

This is the output of my Python script so far (screenshot of the Excel table omitted).
The vertical axis of the table lists road names. The horizontal axis lists dates. The values indicate whether a road was under construction at the time, and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots, for each year, the longest amount of time that a road was under construction as well as the average for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated.
EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp

workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()

def to_raw(string):
    return fr"{string}"

def cvrt(x):
    ans = re.split(r'(\d+)(?!.*\d)', x)
    return int(ans[1])

def indexer(s):
    pattern = re.compile(r'I, [0-9]+, ')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def int2Date(x):
    string = str(x)
    Y = int(string[0:4])
    M = int(string[4:6])
    D = int(string[6:8])
    return DATE(Y, M, D)

def dDelta(x, y):
    string1 = str(x)
    string2 = str(y)
    Y1 = int(string1[0:4])
    M1 = int(string1[4:6])
    D1 = int(string1[6:8])
    Y2 = int(string2[0:4])
    M2 = int(string2[4:6])
    D2 = int(string2[6:8])
    f_date = DATE(Y1, M1, D1)
    l_date = DATE(Y2, M2, D2)
    delta = l_date - f_date
    if isinstance(y, int):
        return float(int((delta.days) / 30.44))
    else:
        return int((delta.days) / 30.44)

def Book(path):
    file = open(path, 'r')
    lines = file.readlines()
    file.close()
    book = IndexedOrderedDict()
    for line in lines:
        if re.match("I", line):
            IDs = indexer(line)[1]
        if re.match(" 0.00,", line):
            rID = line
        # "GM_FINAL_AUTH,0,[1-9]"
        if re.search("GM_FINAL_AUTH,0,[1-9]", line):
            book.update({(rID, line): to_raw(IDs)})
    return sort_book(book)

def dUpdate(dic, key, value):
    return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})

def valSplt(s):
    pattern = re.compile(r'(\d+)')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def sort_book(book):
    book = natsorted([value, key] for key, value in book.items())
    book = IndexedOrderedDict((data[1], data[0]) for data in book)
    return book

def alph_order(word1, word2):
    for i in range(min(len(word1), len(word2))):
        if ord(word1[i]) == ord(word2[i]):
            pass
        elif ord(word1[i]) > ord(word2[i]):
            return word2
        else:
            return word1
    return word1

def read(cpdm, date_list):
    sCnt = [0] * len(cpdm)
    lowest_number = 999999999999
    terminationCondition = [True] * len(cpdm)
    saved_results = [0] * len(cpdm)
    current_prefix = None
    cnt = 0
    while any(terminationCondition) is True:
        saved_results = [0] * len(cpdm)
        last_prefix = None
        lowest_number = 999999999999
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                ID = cpdm[dicIdx].values()[dicVal]
                # print(entry)
                current_prefix, road_number = valSplt(ID)
                road_number = int(road_number)
                if last_prefix is None:
                    last_prefix = current_prefix
                higherOrder_prefix = alph_order(last_prefix, current_prefix)
                # print('check:', [higherOrder_prefix, last_prefix, current_prefix])
                if current_prefix == higherOrder_prefix:
                    if current_prefix != last_prefix:
                        lowest_number = road_number
                        last_prefix = current_prefix
                    elif road_number < lowest_number:
                        lowest_number = road_number
                        last_prefix = current_prefix
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                # print(dicIdx, dicVal, len(cpdm[dicIdx]))
                ID = cpdm[dicIdx].values()[dicVal]
                VALUE = cpdm[dicIdx].keys()[dicVal]
                # print(entry)
                road_name, road_number = valSplt(ID)
                road_number = int(road_number)
                if road_name == last_prefix and lowest_number == road_number:
                    saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
                    if dicVal < len(cpdm[dicIdx]):
                        sCnt[dicIdx] += 1
                    else:
                        terminationCondition[dicIdx] = False
            else:
                terminationCondition[dicIdx] = False
        for rst in range(len(saved_results)):
            if saved_results[rst] == 0:
                pass
            else:
                sheet.write(cnt + 1, 0, str(saved_results[rst][0]))
                sheet.write(cnt + 1, rst + 1, cvrt(saved_results[rst][1]))
                # sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
                # sheet.write(cnt+1, 0, saved_results[rst][3])
        cnt += 1

def main():
    # 2018 MAPS
    path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
    path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
    path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
    path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
    # 2019 MAPS
    path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
    path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
    path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
    path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
    path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
    path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
    # 2020 MAPS
    path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
    path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
    path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
    p_list = [path1, path2, path3, path4, path5, path6, path7,
              path8, path9, path10, path11, path12, path13]
    pool = mp.Pool(mp.cpu_count())
    CPDM = pool.map(Book, p_list)
    pool.close()
    # pool.join()
    date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
                 20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
    # CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
    for i in CPDM:
        print(len(i))
    # sheet.write("A1", "Lat Long")
    sheet.write("A1", "ID")
    # for i in range(len(CPDM)):
    cn = 0
    for i in date_list:
        # sheet.write(0, 3*i+1, "ID" + str(i+1))
        sheet.write(0, cn + 1, int2Date(i), format1)
        cn += 1
        # sheet.write(0, 2*i+3, "Date" + str(i+1))
    read(CPDM, date_list)
    workBook.close()

if __name__ == "__main__":
    main()
    executionTime = (time.time() - startTime)
    print('Execution time in minutes: ' + str(executionTime / 60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices on your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those two dates as closed? Or only until New Year's?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px

# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")

# Flip the dataframe (dates should be on the index)
df = df.transpose()

# Fill any empty cells with 0
df = df.fillna(0)

# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))

# Make sure the dates are sorted
df = df.sort_index()

# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # Add one day because groups with the same start and finish would not be visible on the plot
        })

# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)

# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed")  # otherwise tasks are listed from the bottom up
fig.show()
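Building on roads_df above, the two yearly numbers from the question (longest closure and the average) could then be sketched like this, assuming the column headers were parsed as timestamps and with the simplification that each period is attributed to the year it starts in:
# duration of each closure period in days
roads_df["days"] = (roads_df["finish"] - roads_df["start"]).dt.days
# attribute each period to its starting year (periods spanning
# New Year are not split between years in this sketch)
roads_df["year"] = pandas.to_datetime(roads_df["start"]).dt.year
per_year = roads_df.groupby("year")["days"]
print(per_year.max())   # longest single closure per year
print(per_year.mean())  # average closure length per year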

How do I get the calculated column names from a pandas pivot table

I am trying to create an Excel spreadsheet from a pandas.pivot_table.
I don't want to use to_excel, as it:
- renders at half the speed
- doesn't allow cell formatting
- doesn't compute cell widths
- doesn't allow me to create other cells with formulas
- etc.
So instead I:
- create a dataframe from a list of lists
- convert it to a pivot table with pivot_table
- convert that to records with to_records
Now I can create a worksheet, but I need the column headers for the indexes. I can save those as they were specified when creating the pivot table, but how do I get the inferred column names deduced from the distinct values?
import pandas
import datetime
import logging
import math
import random
from dateutil.relativedelta import relativedelta
import xlsxwriter

logging.basicConfig(level=logging.INFO)

def get_sales_data(*, num_records, num_custs, num_products, date_from, number_of_months):
    print("number of months %s" % number_of_months)
    sales = {}
    rows = []
    for cust_nbr in range(0, num_custs):
        cust_name = "cust " + "{0:0>3}".format(cust_nbr)
        for month_delta in range(0, number_of_months):
            ship_date = date_from + relativedelta(months=month_delta)
            for product_nbr in (0, num_products):
                product = "product " + str(product_nbr)
                qty = random.randint(0, 20)
                if (qty > 0):
                    key = (cust_name, product, ship_date)
                    shipment = (cust_name, product, ship_date, qty)
                    sales[key] = shipment
    for shipment in sales.values():
        rows.append(shipment)
    return rows

def to_excel(workbook, sheetname, dataframe):
    worksheet = workbook.add_worksheet(sheetname)
    row_index = 1
    max_widths = [None] * len(dataframe[0])
    for row in dataframe:
        # max_widths = [None] * len(row)
        col_index = 0
        for datum in row:
            if datum is not None:
                if isinstance(datum, float):
                    if not math.isnan(datum):
                        worksheet.write(row_index, col_index, datum)
                else:
                    worksheet.write(row_index, col_index, datum)
                # print("len(max_widths) %s col_index %s " % (len(max_widths), col_index))
                if max_widths[col_index] is None or len(str(datum)) > max_widths[col_index]:
                    max_widths[col_index] = len(str(datum))
            # if row_index < 5:
            #     print("r: %s c: %s %s" % (row_index, col_index, datum))
            col_index += 1
        row_index += 1
    col_index = 0
    for width in max_widths:
        worksheet.set_column(col_index, col_index, width + 1)
        col_index += 1

# Get a list of lists
from_date = datetime.date(2015, 1, 1)
to_date = datetime.date(2017, 7, 1)
matrix = get_sales_data(num_records=3000, num_products=3, date_from=from_date, number_of_months=30, num_custs=1000)

# Get a dataframe
labels = ["cust_name", "product", "ship_date", "qty"]
dataframe = pandas.DataFrame.from_records(matrix, columns=labels)
print(dataframe)

# get pivot
pivot_table = pandas.pivot_table(dataframe, columns='ship_date', values='qty', index=['cust_name', 'product'])
print(pivot_table)

# convert back to records
records = pivot_table.to_records()

# get excel
output = open("/tmp/crosstab.xlsx", "wb")
workbook = xlsxwriter.Workbook(output)
to_excel(workbook, "Cust Product By Month", records)
workbook.close()
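For what it's worth, continuing from the script above, the inferred headers are available on the pivot table object itself before the to_records call; this sketch uses only standard pandas attributes:
# index headers, as specified when creating the pivot table
index_names = list(pivot_table.index.names)    # ['cust_name', 'product']
# column names inferred from the distinct ship_date values
inferred_columns = list(pivot_table.columns)   # one entry per distinct ship_date
# after to_records(), both appear together as the record field names
all_headers = list(pivot_table.to_records().dtype.names)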

How to split merged cells in excel workbook using python

Is there any way to split/unmerge cells in an Excel workbook using Python? What I want is explained below (screenshots omitted).
The result should be a new Excel file with the following entries.
My solution, which uses xlrd to copy the same string into every cell of the merged column, is given below.
[Note: the "formatting_info=True" flag is not yet implemented in the version of xlrd I am using, hence I cannot directly get the list of merged cells. I am not supposed to upgrade xlrd on this setup.]
def xlsx_to_dict():
    workbook = xlrd.open_workbook(xlsfile)
    worksheet_names = workbook.sheet_names()
    for worksheet_name in worksheet_names:
        worksheet = workbook.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows - 1
        num_cells = worksheet.ncols - 1
        curr_row = -1
        header_row = worksheet.row(0)
        columns = []
        for cell in range(len(header_row)):
            value = worksheet.cell_value(0, cell)
            columns.append(value)
        cities = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 0)
            type = worksheet.cell_type(row, 0)
            if not value == "":
                cities.append(value)
        names = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 1)
            type = worksheet.cell_type(row, 1)
            if not value == "":
                names.append(value)
        current_city = cities[0]
        result_dict = {}
        for curr_row in range(1, num_rows):
            row = worksheet.row(curr_row)
            curr_cell = -1
            curr_name = names[0]
            while curr_cell < num_cells:
                curr_cell += 1
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                if cell_value in cities and curr_cell == 0:
                    current_city = cell_value
                    if not result_dict.has_key(current_city):
                        result_dict[current_city] = {}
                    continue
                if cell_value == "" and curr_cell == 0:
                    continue
                if cell_value in names and curr_cell == 1:
                    curr_name = cell_value
                    if not result_dict[current_city].has_key(curr_name):
                        result_dict[current_city][curr_name] = {}
                    continue
                if cell_value == "" and curr_cell == 1:
                    continue
                try:
                    result_dict[current_city][curr_name]['Phone'].append(cell_value)
                except:
                    result_dict[current_city][curr_name]['Phone'] = [cell_value]
The above function will return a Python dictionary like this:
{ 'New York' : { 'Tom' : [92929292, 33929] }, ........}
I will then traverse the dictionary and write a new Excel file.
However, I want some generic way of splitting merged cells.
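If switching libraries is an option, a generic sketch with openpyxl (an assumption, since the question is tied to xlrd) would iterate over merged_cells.ranges, unmerge each range, and copy the top-left value into every cell it used to cover:
from openpyxl import load_workbook

wb = load_workbook('input.xlsx')  # hypothetical input file
for ws in wb.worksheets:
    # list() because unmerging mutates the collection while we iterate
    for merged in list(ws.merged_cells.ranges):
        top_left = ws.cell(merged.min_row, merged.min_col).value
        ws.unmerge_cells(str(merged))
        for row in range(merged.min_row, merged.max_row + 1):
            for col in range(merged.min_col, merged.max_col + 1):
                ws.cell(row, col, top_left)
wb.save('output.xlsx')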
This function gets the "real" cell value, i.e., the value of the merged cell if the coordinates are anywhere inside the merged cell.
def unmergedValue(rowx, colx, thesheet):
    for crange in thesheet.merged_cells:
        rlo, rhi, clo, chi = crange
        if rowx in xrange(rlo, rhi):
            if colx in xrange(clo, chi):
                return thesheet.cell_value(rlo, clo)
    # if you reached this point, it's not in any merged cells
    return thesheet.cell_value(rowx, colx)
Loosely based on http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Sheet.merged_cells-attribute
Very inefficient, but should be acceptable for small-ish spreadsheets.
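A hypothetical usage of the helper, reading every cell through it so merged regions repeat their top-left value (note that xlrd only populates merged_cells when the workbook is opened with formatting_info=True):
import xlrd

book = xlrd.open_workbook('input.xls', formatting_info=True)
sheet = book.sheet_by_index(0)
for rowx in range(sheet.nrows):
    print([unmergedValue(rowx, colx, sheet) for colx in range(sheet.ncols)])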
If your file has no empty cells in the middle, this may help: read the file, do some work, rewrite it.
def read_merged_xls(file_contents):
    book = xlrd.open_workbook(file_contents=file_contents)
    data = []
    sheet = book.sheet_by_index(0)
    for rx in range(sheet.nrows):
        line = []
        for ry in range(sheet.ncols):
            cell = sheet.cell_value(rx, ry)
            if not cell:
                cell = data[-1][ry] if data else ''
            line.append(cell)
        data.append(line)
    return data
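Hypothetical usage: the function expects the raw bytes of the workbook, since it passes them to xlrd via file_contents:
with open('input.xls', 'rb') as f:
    rows = read_merged_xls(f.read())
for row in rows:
    print(row)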
import xlrd
import xlsxwriter
import numpy as np
import pandas as pd

def rep(l, i):
    j = i
    while (j >= 0):
        if not l[j - 1] == u'':
            return l[j - 1]
        else:
            j = j - 1

def write_df2xlsx(df, filename):
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # Close the Pandas Excel writer and output the Excel file.
    writer.save()

def csv_from_excel(filename):
    wb = xlrd.open_workbook(filename)
    worksheet_names = wb.sheet_names()
    for worksheet_name in worksheet_names:
        sh = wb.sheet_by_name(worksheet_name)
        # To find the headers/column names of the xlsx file
        header_index = 0
        for i in range(sh.nrows):
            if (len(filter(lambda x: not (x.value == xlrd.empty_cell.value), sh.row(i))) == len(sh.row(i))):
                header_row = sh.row(i)
                header_index = i
                break
        columns = []
        for cell in range(len(header_row)):
            value = sh.cell_value(header_index, cell)
            columns.append(value)
        rows = []
        for rownum in range(header_index + 1, sh.nrows):
            rows.append(sh.row_values(rownum))
        data = pd.DataFrame(rows, columns=columns)
        cols = [col for col in data.columns if u'' in list(data[col])]
        res = []
        for col in cols:
            t_list = list(data[col])
            res.append(map(lambda x, y: rep(list(data[col]), y[0]) if x == u'' else x, t_list, enumerate(t_list)))
        for (col, r) in zip(cols, res):
            data[col] = pd.core.series.Series(r)
        write_df2xlsx(data, 'ResultFile.xlsx')
