Convert Excel to XML in Python

I am trying to convert an Excel database to XML in Python.
I have trading data that I need to import into the system in XML format.
My code is the following:
import xml.etree.ElementTree as ET

import pandas as pd

df = pd.read_excel("C:/Users/junag/Documents/XML/Portfolio2.xlsx", sheet_name="Sheet1", dtype=object)

root = ET.Element('trading-data')
root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
tree = ET.ElementTree(root)
Portfolios = ET.SubElement(root, "Portfolios")
Defaults = ET.SubElement(Portfolios, "Defaults", BaseCurrency="USD")

for row in df.itertuples():
    Portfolio = ET.SubElement(Portfolios, "Portfolio", Name=row.Name, BaseCurrency=row.BaseCurrency2, TradingPower=str(row.TradingPower),
                              ValidationProfile=row.ValidationProfile, CommissionProfile=row.CommissionProfile)
    PortfolioPositions = ET.SubElement(Portfolio, "PortfolioPositions")
    if row.Type == "Cash":
        PortfolioPosition = ET.SubElement(PortfolioPositions, "PortfolioPosition", Type=row.Type, Volume=str(row.Volume))
        Cash = ET.SubElement(PortfolioPosition, 'Cash', Currency=str(row.Currency))
    else:
        PortfolioPosition = ET.SubElement(PortfolioPositions, "PortfolioPosition", Type=row.Type, Volume=str(row.Volume),
                                          Invested=str(row.Invested), BaseInvested=str(row.BaseInvested))
        Instrument = ET.SubElement(PortfolioPosition, 'Instrument', Ticker=str(row.Ticker), ISIN=str(row.ISIN), Market=str(row.Market),
                                   Currency=str(row.Currency2), CFI=str(row.CFI))

ET.indent(tree, space="\t", level=0)
tree.write("Portfolios_converted2.xml", encoding="utf-8")
The output currently repeats a <Portfolio> element for every row, while I need the rows grouped under a single <Portfolio> with one <PortfolioPositions> parent. (Screenshots of the current output, the desired output, and the Excel source data accompanied the original question.) How can I improve my code to make the output XML look like that? Please advise.

Since you need a single <Portfolio> and <PortfolioPositions> as the parent grouping, consider a nested loop: iterate through a list of data frame splits, then within each data frame loop through its rows:
import xml.etree.ElementTree as ET
import xml.dom.minidom as md

import pandas as pd

df = pd.read_excel("Input.xlsx", sheet_name="Sheet1", dtype=object)

# LIST OF DATA FRAME SPLITS (one sub-frame per portfolio)
df_list = [g for i, g in df.groupby(
    ["Name", "BaseCurrency2", "TradingPower", "ValidationProfile", "CommissionProfile"]
)]

# ROOT LEVEL
root = ET.Element('trading-data')
root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

# ROOT CHILD LEVEL
Portfolios = ET.SubElement(root, "Portfolios")
Defaults = ET.SubElement(Portfolios, "Defaults", BaseCurrency="USD")

# GROUP LEVEL ITERATION ("sub" avoids shadowing the outer df)
for sub in df_list:
    Portfolio = ET.SubElement(
        Portfolios,
        "Portfolio",
        # .iloc[0] is positional; the splits keep their original index,
        # so a plain [0] would raise a KeyError for most groups
        Name=sub["Name"].iloc[0],
        BaseCurrency=sub["BaseCurrency2"].iloc[0],
        TradingPower=str(sub["TradingPower"].iloc[0]),
        ValidationProfile=sub["ValidationProfile"].iloc[0],
        CommissionProfile=sub["CommissionProfile"].iloc[0]
    )
    PortfolioPositions = ET.SubElement(Portfolio, "PortfolioPositions")

    # ROW LEVEL ITERATION
    for row in sub.itertuples():
        if row.Type == "Cash":
            PortfolioPosition = ET.SubElement(
                PortfolioPositions,
                "PortfolioPosition",
                Type=row.Type,
                Volume=str(row.Volume)
            )
            Cash = ET.SubElement(
                PortfolioPosition,
                "Cash",
                Currency=str(row.Currency)
            )
        else:
            PortfolioPosition = ET.SubElement(
                PortfolioPositions,
                "PortfolioPosition",
                Type=row.Type,
                Volume=str(row.Volume),
                Invested=str(row.Invested),
                BaseInvested=str(row.BaseInvested)
            )
            Instrument = ET.SubElement(
                PortfolioPosition,
                "Instrument",
                Ticker=str(row.Ticker),
                ISIN=str(row.ISIN),
                Market=str(row.Market),
                Currency=str(row.Currency2),
                CFI=str(row.CFI)
            )

# SAVE PRETTY PRINT OUTPUT
with open("Output.xml", "wb") as f:
    dom = md.parseString(ET.tostring(root))
    f.write(dom.toprettyxml().encode("utf-8"))
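As an aside, on Python 3.9+ you can skip minidom entirely, since ElementTree has its own pretty-printer. A minimal sketch using the same root element:

tree = ET.ElementTree(root)
ET.indent(tree, space="\t")  # in-place pretty-printing, available since Python 3.9
tree.write("Output.xml", encoding="utf-8", xml_declaration=True)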

Converting Excel to XML in Python
import openpyxl
import xml.etree.ElementTree as ET

def convert_excel_to_xml(file_name, sheet_name):
    wb = openpyxl.load_workbook(file_name)
    sheet = wb[sheet_name]
    root = ET.Element("root")
    for row in sheet.rows:
        for cell in row:
            # attribute values must be strings; cell.value can be None or a number
            ET.SubElement(root, "cell", value=str(cell.value))
    tree = ET.ElementTree(root)
    tree.write("{}.xml".format(sheet_name))

# Run the function
convert_excel_to_xml("test.xlsx", "Sheet1")
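Note that this flattens the whole sheet into a single list of <cell> elements. If you want to preserve the row structure, a small variation on the loop above (same workbook assumptions) nests the cells under <row> elements:

for row in sheet.rows:
    row_el = ET.SubElement(root, "row")  # one <row> element per spreadsheet row
    for cell in row:
        ET.SubElement(row_el, "cell", value=str(cell.value))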

Related

Loop through multiple xml files

I'm fairly new to Python and would like to loop through multiple XML files. I'm currently using the following code to pull in the sample2 XML file:
import xml.etree.ElementTree as ET
import pandas as pd
import os

tree = ET.parse("sample2.xml")
root = tree.getroot()
qty = root.iterfind(".//Qty")
pri = root.iterfind(".//PriceAmount")
cor = root.iterfind(".//AuctionIdentification")
data = []
for x, y, z in zip(qty, pri, cor):
    #print(x.get("v"), y.get("v"))
    a = x.get("v"), y.get("v"), z.get("v")
    data.append(a)
df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
df['Qty'] = df['Qty'].astype(float)
df['Price'] = df['Price'].astype(float)
#print(df)
total = df['Qty'].sum()
price = df['Price'].mean()
border = df.loc[0, 'Border']
df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
df2['Qty'] = [total]
df2['Price'] = [price]
df2['Border'] = [str(border)[0:12]]
I tried adding the soup XML file to the line of code below, but this didn't work:

tree = ET.parse("sample2.xml , "soup xml")
root = tree.getroot()
Consider turning your code into a function and calling it for the various files you need:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def my_xml_processor(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
    return df2
You can then call it for your files:
my_xml_processor("sample2.xml")
my_xml_processor("soup.xml")
EDIT: these are some minor code changes that I'd recommend:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def my_xml_processor(filename: str) -> pd.DataFrame:  # <- Add type hints
    root = ET.parse(filename).getroot()  # <- tree is not used
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = [  # <- This could be a list comprehension
        (x.get('v'), y.get('v'), z.get('v'))
        for x, y, z in zip(qty, pri, cor)
    ]
    df = (pd
          .DataFrame(data, columns=["Qty", "Price", "Border"])
          .astype({
              'Qty': float,
              'Price': float,
          })
          )
    df2 = df.agg({
        'Qty': 'sum',
        'Price': 'mean',
        'Border': lambda x: str(x[0])[:12]
    }).to_frame().T
    return df2
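The agg call is the interesting part: it applies a different reduction to each column (sum, mean, and a truncating lambda), and .to_frame().T turns the resulting Series back into a single-row DataFrame, so the function is called exactly as before:

df2 = my_xml_processor("sample2.xml")
print(df2)  # one row: summed Qty, mean Price, first 12 characters of Border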
You could use your existing code, but running it in a loop for each filename you have, something like:
import xml.etree.ElementTree as ET
import pandas as pd
import os

files = ['sample2.xml', 'sample3.xml', 'sample4.xml']
for file in files:  # read each filename from the list above
    tree = ET.parse(file)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
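Note that df2 is overwritten on each pass, so only the last file's summary survives the loop. One sketch of a fix that keeps this loop structure is to persist each file's summary at the bottom of the loop body (the _summary.csv naming is just an example):

    # last line of the loop body: save this file's summary before the next pass
    df2.to_csv(file.replace('.xml', '_summary.csv'), index=False)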

Is there a proper way to append JSON Data to a Numpy array

I am trying to add data that I am reading from a series of JSON files to a NumPy array (or whatever data collection would work best). My idea is that I want to sort a collection of episodes of a TV show by episode title.
The problem I have encountered is actually creating the collection from the data.
The intent is to have a collection of the items found within the for loop [a, b, c, d] for each episode of the show.
Is a NumPy array the best way to go about making this collection, or should I use something else?
import json as j

import numpy as np

season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle', 'seasonNum', 'episodeNum', 'plotContents'])

def ReadTheDarnJsonFile(jsonTitle):
    seasondata = jsonTitle.read()
    seasonobj = j.loads(seasondata)
    list = (seasonobj['episodes'])
    for i in range(len(list)):
        a = str(list[i].get('title'))
        b = str(list[i].get('seasonNumber'))
        c = str(list[i].get('episodeNumber'))
        d = str(list[i].get('plot'))
        print(a, b, c, d)
        print("----------------")
        # np.append(season_array, [a,b,c,d]) this is not correct

ReadTheDarnJsonFile(season1)
print(season_array)
Two notes. First, I would avoid using list as a variable name because it shadows the built-in list type in Python. Second, I would recommend using a custom class for your data for maximum readability.
import json as j

season1 = open('THEJSONFILES\seasonone.json', 'r')

class episode:
    def __init__(self, title, seasonNumber, episodeNumber, plot):
        self.title = title
        self.seasonNumber = seasonNumber
        self.episodeNumber = episodeNumber
        self.plot = plot

    def summary(self):
        print("Season " + str(self.seasonNumber) + " Episode " + str(self.episodeNumber))
        print(self.title)
        print(self.plot)

def ReadTheDarnJsonFile(jsonTitle):
    seasondata = jsonTitle.read()
    seasonobj = j.loads(seasondata)
    episodes = seasonobj['episodes']
    season_array = []
    for i in range(len(episodes)):
        a = str(episodes[i].get('title'))
        b = str(episodes[i].get('seasonNumber'))
        c = str(episodes[i].get('episodeNumber'))
        d = str(episodes[i].get('plot'))
        season_array.append(episode(a, b, c, d))
    return season_array

season_array = ReadTheDarnJsonFile(season1)
for item in season_array:
    item.summary()
Here is what I ended up doing.
import json as j
import pandas as pd

emptyArray = []
season1 = open('THEJSONFILES\seasonone.json', 'r')
season2 = open('THEJSONFILES\seasontwo.json', 'r')
season3 = open('THEJSONFILES\seasonthree.json', 'r')
season4 = open('THEJSONFILES\seasonfour.json', 'r')
season5 = open('THEJSONFILES\seasonfive.json', 'r')
season6 = open('THEJSONFILES\seasonsix.json', 'r')
season7 = open('THEJSONFILES\seasonseven.json', 'r')

columnData = ["episodeTitle", "seasonIndex", "episodeIndex", "plot", "imageURL"]
finalDf = pd.DataFrame()  # note the call parentheses; bare pd.DataFrame is the class itself

def ReadTheDarnJsonFile(jsonTitle):
    df = pd.DataFrame(columns=columnData)
    seasonData = jsonTitle.read()
    seasonObj = j.loads(seasonData)
    currentSeasonList = seasonObj['episodes']
    for i in range(len(currentSeasonList)):
        tempTitle = str(currentSeasonList[i].get('title'))
        tempSN = str(currentSeasonList[i].get('seasonNumber'))
        tempEN = str(currentSeasonList[i].get('episodeNumber'))
        tempPlot = str(currentSeasonList[i].get('plot'))
        tempImage = str(currentSeasonList[i].get('image'))
        dataObj = pd.Series([tempTitle, tempSN, tempEN, tempPlot, tempImage], index=df.columns)
        df.loc[i] = dataObj
    emptyArray.append(df)

ReadTheDarnJsonFile(season1)
ReadTheDarnJsonFile(season2)
ReadTheDarnJsonFile(season3)
ReadTheDarnJsonFile(season4)
ReadTheDarnJsonFile(season5)
ReadTheDarnJsonFile(season6)
ReadTheDarnJsonFile(season7)

finalDf = pd.concat(emptyArray)
print(emptyArray)
holyOutput = finalDf.sort_values(by=['episodeTitle'])
holyOutput.reset_index(inplace=True)
holyOutput.to_json("P:\\ProjectForStarWarsCloneWarsJson\JSON\OutputJsonV2.json")
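One small cleanup worth considering: the seven open() calls never close their file handles. A loop with a context manager (same hypothetical file names) does the same work:

season_paths = [
    'THEJSONFILES\\seasonone.json', 'THEJSONFILES\\seasontwo.json',
    'THEJSONFILES\\seasonthree.json', 'THEJSONFILES\\seasonfour.json',
    'THEJSONFILES\\seasonfive.json', 'THEJSONFILES\\seasonsix.json',
    'THEJSONFILES\\seasonseven.json',
]
for path in season_paths:
    with open(path, 'r') as f:  # the file is closed when the block exits
        ReadTheDarnJsonFile(f)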

Facing an issue adding data from a dictionary to a DataFrame

I'm having trouble creating a function to store the API response in columns: passing a city name, the response returns details such as max_temp, min_temp, and pressure, which I want to store in new columns.
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps

api_key = {key from openweather(free)}
owm = pyowm.OWM(api_key)  # this line was missing; weather_manager needs an OWM instance
mgr = owm.weather_manager()
data = []

def get_weather(city):
    observation = mgr.weather_at_place(city)
    l = observation.weather
    Wind_Speed = l.wind()['speed']
    Temp = l.temperature('celsius')['temp']
    Max_temp = l.temperature('celsius')['temp_max']
    Min_temp = l.temperature('celsius')['temp_min']
    #Heat_index = l.heat_index
    Humidity = l.humidity
    Pressure = l.pressure['press']
    weather = {"City": city, "Wind_Speed": Wind_Speed, "Temp": Temp,
               "Max_temp": Max_temp, "Min_temp": Min_temp, "Humidity": Humidity,
               "Pressure": Pressure}
    return weather

for city in df2['City']:
    get_weather(city)
    df = df.append(data, True)
I want to add each weather detail as a column based on the city name, using one function that stores all the details in columns rather than a separate function for each field. The DataFrame looks like this: (screenshot omitted).
You can return a dictionary from your function.
def get_weather(city):
    observation = mgr.weather_at_place(city)
    l = observation.weather
    Wind_Speed = l.wind()['speed']
    Temp = l.temperature('celsius')['temp']
    Max_temp = l.temperature('celsius')['temp_max']
    resp = dict()
    resp['Wind_Speed'] = Wind_Speed
    resp['Temp'] = Temp
    resp['Max_temp'] = Max_temp
    return resp

df["Wind_speed"] = df["city"].apply(lambda x: get_weather(x)['Wind_Speed'])
df["Temp"] = df["city"].apply(lambda x: get_weather(x)['Temp'])
df["Max_temp"] = df["city"].apply(lambda x: get_weather(x)['Max_temp'])

Pandas Dataframe Only Returning first Row of JSON Data

I'm working on a web scraping project and have code that returns the JSON data in the format I want when I use the #print command below, but when I run the same code through a pandas DataFrame it only returns the first row of data. Just running the print returns the expected 17 rows of data; DataFrame to CSV gives me the first row only. Totally stumped! So grateful for anyone's help!
for item in response['body']:
    DepartureDate = item['legs'][0][0]['departDate']
    ReturnDate = item['legs'][1][0]['departDate']
    Airline = item['legs'][0][0]['airline']['code']
    Origin = item['legs'][0][0]['depart']
    Destination = item['legs'][0][0]['destination']
    OD = (Origin + Destination)
    TrueBaseFare = item['breakdown']['baseFareAmount']
    YQYR = item['breakdown']['fuelSurcharge']
    TAX = item['breakdown']['totalTax']
    TTL = item['breakdown']['totalFareAmount']
    MARKEDUPTTL = item['breakdown']['totalCalculatedFareAmount']
    MARKUP = ((MARKEDUPTTL - TTL) / (TTL) * 100)
    FBC = item['fareBasisCode']
    #print(DepartureDate,ReturnDate,Airline,OD,TrueBaseFare,YQYR,TAX,TTL,MARKEDUPTTL,MARKUP,FBC)

MI = pd.DataFrame(
    {'Dept': [DepartureDate],
     'Ret': [ReturnDate],
     'AirlineCode': [Airline],
     'Routing': [OD],
     'RealFare': [TrueBaseFare],
     'Fuel': [YQYR],
     'Taxes': [TAX],
     'RealTotal': [TTL],
     'AgencyTotal': [MARKEDUPTTL],
     'Margin': [MARKUP],
     'FareBasis': [FBC],
     })
df = pd.DataFrame(MI)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df.to_csv('MITest7.csv')
If you print your values after the loop, you will see that only the last item's values remain: each iteration overwrites the previous scalars, and the DataFrame is built once from those. To resolve this, create lists and append your values to them.
Try this:
DepartureDate = []
ReturnDate = []
Airline = []
Origin = []
Destination = []
OD = []
TrueBaseFare = []
YQYR = []
TAX = []
TTL = []
MARKEDUPTTL = []
MARKUP = []
FBC = []

for item in response['body']:
    DepartureDate.append(item['legs'][0][0]['departDate'])
    ReturnDate.append(item['legs'][1][0]['departDate'])
    Airline.append(item['legs'][0][0]['airline']['code'])
    Origin.append(item['legs'][0][0]['depart'])
    Destination.append(item['legs'][0][0]['destination'])
    OD.append(Origin[-1] + Destination[-1])
    TrueBaseFare.append(item['breakdown']['baseFareAmount'])
    YQYR.append(item['breakdown']['fuelSurcharge'])
    TAX.append(item['breakdown']['totalTax'])
    TTL.append(item['breakdown']['totalFareAmount'])
    MARKEDUPTTL.append(item['breakdown']['totalCalculatedFareAmount'])
    MARKUP.append((MARKEDUPTTL[-1] - TTL[-1]) / (TTL[-1]) * 100)
    FBC.append(item['fareBasisCode'])
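A sketch of the final step: after the loop, the DataFrame is built once from the filled lists, reusing the column names from the question, and written out a single time:

df = pd.DataFrame(
    {'Dept': DepartureDate,
     'Ret': ReturnDate,
     'AirlineCode': Airline,
     'Routing': OD,
     'RealFare': TrueBaseFare,
     'Fuel': YQYR,
     'Taxes': TAX,
     'RealTotal': TTL,
     'AgencyTotal': MARKEDUPTTL,
     'Margin': MARKUP,
     'FareBasis': FBC,
     })
df.to_csv('MITest7.csv')  # now writes all rows, not just the last one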

Saving data from Arduino using Python - loss of data

With the help of web, i have created a code that collects the data form Arduino uno, and saves it to csv file.
The data collected are raw values of MEMS accelerometers.
The problem in code is that very often i loose a lot of data, if not all, if i terminate the Python. I noticed that at a random time, the output csv file has zero bytes.
Temporary solution is to start Arduino's "Serial monitor". This way most of the measured data is saved.
import serial
import time
import csv
import numpy as np
import pandas as pd

timeHr = []
timeT = []
mem1xD = []
mem1yD = []
mem1zD = []
#
mem2xD = []
mem2yD = []
mem2zD = []

arduinoData = serial.Serial('COM4', 9600)

df = pd.DataFrame({
        'timeHr': 0,
        'timeT': 0,
        'mem1xD': 0,
        'mem1yD': 0,
        'mem1zD': 0,
        'mem2xD': 0,
        'mem2yD': 0,
        'mem2zD': 0,
    },
    index=[0]
)

while True:
    while arduinoData.inWaiting() == 0:
        pass
    arduinoString = arduinoData.readline().decode("utf-8")
    dataArray = arduinoString.split(",")
    timehr = dataArray[0]
    time = float(dataArray[1]) / 1000  # note: this shadows the imported time module
    mem1x = float(dataArray[2])
    mem1y = float(dataArray[3])
    mem1z = float(dataArray[4])
    #
    mem2x = float(dataArray[5])
    mem2y = float(dataArray[6])
    mem2z = float(dataArray[7])
    timeHr.append(timehr)
    timeT.append(time)
    mem1xD.append(mem1x)
    mem1yD.append(mem1y)
    mem1zD.append(mem1z)
    #
    mem2xD.append(mem2x)
    mem2yD.append(mem2y)
    mem2zD.append(mem2z)
    df = pd.DataFrame({
        'timeHr': timeHr,
        'timeT': timeT,
        'mem1xD': mem1xD,
        'mem1yD': mem1yD,
        'mem1zD': mem1zD,
        'mem2xD': mem2xD,
        'mem2yD': mem2yD,
        'mem2zD': mem2zD,
    })
    df.to_csv(r'time4.csv')
You need to append each new reading to the output file as it arrives, rather than rewriting the whole file every pass. Passing mode='a' to pd.DataFrame.to_csv will allow you to do that:
import serial
import time

import pandas as pd

arduinoData = serial.Serial('COM4', 9600)

tStart = str(time.time()).split('.')[0]
fileOut = tStart + '.csv'

header_written = False
while True:
    while arduinoData.inWaiting() == 0:
        pass
    arduinoString = arduinoData.readline().decode("utf-8")
    dataArray = arduinoString.split(",")
    # Build a single-row frame for this reading only. Re-appending the full
    # history every pass would duplicate earlier rows in the CSV, and the
    # in-memory lists are no longer needed since every row is persisted
    # immediately.
    df = pd.DataFrame({
        'timeHr': [dataArray[0]],
        'timeT': [float(dataArray[1]) / 1000],
        'mem1xD': [float(dataArray[2])],
        'mem1yD': [float(dataArray[3])],
        'mem1zD': [float(dataArray[4])],
        'mem2xD': [float(dataArray[5])],
        'mem2yD': [float(dataArray[6])],
        'mem2zD': [float(dataArray[7])],
    })
    df.to_csv(fileOut, mode='a', header=not header_written, index=False)
    header_written = True
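Since the original complaint was losing data when Python is terminated: with mode='a' each row is on disk the moment it is read, so stopping the script becomes safe. A minimal sketch, assuming the acquisition loop above is wrapped in a function named acquire, that also closes the serial port cleanly on Ctrl+C:

def acquire():
    # the while True acquisition loop from above goes here
    ...

try:
    acquire()
except KeyboardInterrupt:
    arduinoData.close()  # rows already appended to the CSV remain on disk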
