Loop through multiple xml files - python

I'm fairly new to Python and would like to loop through multiple XML files. I'm currently using the following code to pull in the sample2 XML file:
import xml.etree.ElementTree as ET
import pandas as pd
import os
tree=ET.parse("sample2.xml")
root = tree.getroot()
qty=root.iterfind(".//Qty")
pri=root.iterfind(".//PriceAmount")
cor=root.iterfind(".//AuctionIdentification")
data =[]
for x, y, z in zip(qty, pri, cor):
    #print(x.get("v"), y.get("v"))
    a = x.get("v"), y.get("v"), z.get("v")
    data.append(a)
df = pd.DataFrame(data, columns=["Qty", "Price" , "Border"])
df['Qty'] = df['Qty'].astype(float)
df['Price'] = df['Price'].astype(float)
#print(df)
total = df['Qty'].sum()
price = df['Price'].mean()
border = df.loc[0,'Border']
df2 = pd.DataFrame(columns=["Qty", "Price" , "Border"])
df2['Qty'] = [total]
df2['Price'] = [price]
df2['Border'] = [str(border)[0:12]]
I tried adding the soup XML file to the line of code below, but this didn't work:
tree=ET.parse("sample2.xml , "soup xml")
root = tree.getroot()

Consider turning your code into a function and calling it for the various files you need:
import xml.etree.ElementTree as ET
import pandas as pd
import os
def my_xml_processor(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
    return df2
You can then call it for your files:
my_xml_processor("sample2.xml")
my_xml_processor("soup.xml")
EDIT: these are some minor code changes that I'd recommend:
import xml.etree.ElementTree as ET
import pandas as pd
import os
def my_xml_processor(filename:str)->pd.DataFrame: # <- Add type hints
    root = ET.parse(filename).getroot() # <- tree is not used
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = [ # <- This could be a list comprehension
        (x.get('v'), y.get('v'), z.get('v'))
        for x, y, z in zip(qty, pri, cor)
    ]
    df = (pd
          .DataFrame(data, columns=["Qty", "Price", "Border"])
          .astype({
              'Qty': float,
              'Price': float,
          })
    )
    df2 = df.agg({
        'Qty': 'sum',
        'Price': 'mean',
        'Border': lambda x: str(x[0])[:12]
    }).to_frame().T
    return df2

You could use your existing code, but run it in a loop over each filename you have, something like:
import xml.etree.ElementTree as ET
import pandas as pd
import os
files = ['sample2.xml', 'sample3.xml', 'sample4.xml']
for file in files:  # read each filename from the list above
    tree = ET.parse(file)
    root = tree.getroot()
    qty = root.iterfind(".//Qty")
    pri = root.iterfind(".//PriceAmount")
    cor = root.iterfind(".//AuctionIdentification")
    data = []
    for x, y, z in zip(qty, pri, cor):
        #print(x.get("v"), y.get("v"))
        a = x.get("v"), y.get("v"), z.get("v")
        data.append(a)
    df = pd.DataFrame(data, columns=["Qty", "Price", "Border"])
    df['Qty'] = df['Qty'].astype(float)
    df['Price'] = df['Price'].astype(float)
    #print(df)
    total = df['Qty'].sum()
    price = df['Price'].mean()
    border = df.loc[0, 'Border']
    df2 = pd.DataFrame(columns=["Qty", "Price", "Border"])
    df2['Qty'] = [total]
    df2['Price'] = [price]
    df2['Border'] = [str(border)[0:12]]
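Note that df2 is overwritten on every pass, so only the last file's summary survives after the loop. If you need one row per file, a hedged option is to collect each summary in a list and concatenate at the end, for example:
# sketch: keep every file's summary instead of overwriting df2
all_results = []
for file in files:
    all_results.append(my_xml_processor(file))  # reuses the function from the first answer
combined = pd.concat(all_results, ignore_index=True)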


Convert excel to XML in python

I am trying to convert an Excel database to XML with Python.
I have trading data which I need to import into the system in XML format.
My code is the following:
import pandas as pd
import xml.etree.ElementTree as ET

df = pd.read_excel("C:/Users/junag/Documents/XML/Portfolio2.xlsx", sheet_name="Sheet1", dtype=object)
root = ET.Element('trading-data')
root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
tree = ET.ElementTree(root)
Portfolios = ET.SubElement(root, "Portfolios")
Defaults = ET.SubElement(Portfolios, "Defaults", BaseCurrency="USD")
for row in df.itertuples():
    Portfolio = ET.SubElement(Portfolios, "Portfolio", Name=row.Name, BaseCurrency=row.BaseCurrency2, TradingPower=str(row.TradingPower),
                              ValidationProfile=row.ValidationProfile, CommissionProfile=row.CommissionProfile)
    PortfolioPositions = ET.SubElement(Portfolio, "PortfolioPositions")
    if row.Type == "Cash":
        PortfolioPosition = ET.SubElement(PortfolioPositions, "PortfolioPosition", Type=row.Type, Volume=str(row.Volume))
        Cash = ET.SubElement(PortfolioPosition, 'Cash', Currency=str(row.Currency))
    else:
        PortfolioPosition = ET.SubElement(PortfolioPositions, "PortfolioPosition", Type=row.Type, Volume=str(row.Volume),
                                          Invested=str(row.Invested), BaseInvested=str(row.BaseInvested))
        Instrument = ET.SubElement(PortfolioPosition, 'Instrument', Ticker=str(row.Ticker), ISIN=str(row.ISIN), Market=str(row.Market),
                                   Currency=str(row.Currency2), CFI=str(row.CFI))
ET.indent(tree, space="\t", level=0)
tree.write("Portfolios_converted2.xml", encoding="utf-8")
The output looks like this:
[screenshot of the current output XML]
While I need it to look like this:
[screenshot of the desired output XML]
How can I improve my code to make the output XML look better? Please advise.
Here is the Excel data:
[screenshot of the Excel sheet]
Since you need a single <Portfolio> and <PortfolioPositions> as the parent grouping, consider a nested loop: iterate through a list of data frames, then within each data frame loop through its rows:
import xml.etree.ElementTree as ET
import pandas as pd
import xml.dom.minidom as md

df = pd.read_excel("Input.xlsx", sheet_name="Sheet1", dtype=object)

# LIST OF DATA FRAME SPLITS
df_list = [g for i, g in df.groupby(
    ["Name", "BaseCurrency2", "TradingPower", "ValidationProfile", "CommissionProfile"]
)]

# ROOT LEVEL
root = ET.Element('trading-data')
root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

# ROOT CHILD LEVEL
Portfolios = ET.SubElement(root, "Portfolios")
Defaults = ET.SubElement(Portfolios, "Defaults", BaseCurrency="USD")

# GROUP LEVEL ITERATION
for df in df_list:
    Portfolio = ET.SubElement(
        Portfolios,
        "Portfolio",
        Name = df["Name"].iloc[0],                     # .iloc[0]: each group keeps its original index labels
        BaseCurrency = df["BaseCurrency2"].iloc[0],
        TradingPower = str(df["TradingPower"].iloc[0]),
        ValidationProfile = df["ValidationProfile"].iloc[0],
        CommissionProfile = df["CommissionProfile"].iloc[0]
    )
    PortfolioPositions = ET.SubElement(Portfolio, "PortfolioPositions")

    # ROW LEVEL ITERATION
    for row in df.itertuples():
        if row.Type == "Cash":
            PortfolioPosition = ET.SubElement(
                PortfolioPositions,
                "PortfolioPosition",
                Type = row.Type,
                Volume = str(row.Volume)
            )
            Cash = ET.SubElement(
                PortfolioPosition,
                "Cash",
                Currency = str(row.Currency)
            )
        else:
            PortfolioPosition = ET.SubElement(
                PortfolioPositions,
                "PortfolioPosition",
                Type = row.Type,
                Volume = str(row.Volume),
                Invested = str(row.Invested),
                BaseInvested = str(row.BaseInvested)
            )
            Instrument = ET.SubElement(
                PortfolioPosition,
                "Instrument",
                Ticker = str(row.Ticker),
                ISIN = str(row.ISIN),
                Market = str(row.Market),
                Currency = str(row.Currency2),
                CFI = str(row.CFI)
            )

# SAVE PRETTY PRINT OUTPUT
with open("Output.xml", "wb") as f:
    dom = md.parseString(ET.tostring(root))
    f.write(dom.toprettyxml().encode("utf-8"))
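As an aside, on Python 3.9+ xml.etree.ElementTree can pretty-print on its own via ET.indent (the same call the question already uses), so the minidom round-trip is optional. A minimal sketch:
# sketch: pretty-print with the standard library only (requires Python 3.9+)
tree = ET.ElementTree(root)
ET.indent(tree, space="    ")
tree.write("Output.xml", encoding="utf-8", xml_declaration=True)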
Converting Excel to XML in Python with openpyxl:
import openpyxl
import xml.etree.ElementTree as ET

def convert_excel_to_xml(file_name, sheet_name):
    wb = openpyxl.load_workbook(file_name)
    sheet = wb[sheet_name]
    root = ET.Element("root")
    for row in sheet.rows:
        for cell in row:
            # attribute values must be strings, so cast the cell value
            ET.SubElement(root, "cell", value=str(cell.value))
    tree = ET.ElementTree(root)
    tree.write("{}.xml".format(sheet_name))
Run the function
convert_excel_to_xml("test.xlsx", "Sheet1")

How to use Dask with .groupby.apply?

I have a dataframe df.
I would like to partition df into sub-dataframes and apply the function find_root to each of them. My function only takes the columns id and parent_id as input.
Then I would like to concatenate the resulting dataframes. Because my dataframe is huge (over 4 million rows), I would like to use Dask, but I get this error:
ValueError: The columns in the computed data do not match the columns in the provided metadata
Extra: []
Missing: [2]
Could you please elaborate on how to solve this error?
import pandas as pd
import networkx as nx
from dask.distributed import Client
import dask.dataframe as dd

client = Client(n_workers=2, threads_per_worker=1, processes=False, memory_limit='4GB')

def find_root(df):
    g = nx.from_pandas_edgelist(df, source='parent_id', target='id', create_using=nx.DiGraph())
    roots = {n for n, d in g.in_degree() if d == 0}
    tmp = {}
    for r in roots:
        tree = nx.dfs_tree(g, r)  # dfs_tree is in the networkx namespace
        tmp[r] = list(tree.nodes)
    tmp = pd.DataFrame.from_dict(tmp, orient='index').T
    tmp = tmp.melt(value_name='node', var_name='root').dropna()
    return tmp

path = 'https://raw.githubusercontent.com/leanhdung1994/WebMining/main/sample_df.csv'
df = dd.read_csv(path, header=0)
df = df[['id', 'created_utc', 'ups', 'link_id', 'author', 'body', 'parent_id']]
df['parent_id'] = df['parent_id'].str.split('_', expand=True, n=2)[1]
df['link_id'] = df['link_id'].str.split('_', expand=True, n=2)[1]
result = df.groupby('link_id').apply(find_root, meta=object)
computed_result = result.compute()
Update: I added dtype to dd.read_csv
df = dd.read_csv(path, header = 0, dtype = {'id': 'str', 'parent_id': 'str', 'link_id': 'str'})
but the problem persists.
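For what it's worth, one common cause of this error is that meta=object does not describe what find_root actually returns (a DataFrame with root and node columns), so Dask's metadata and the computed partitions disagree. A hedged sketch of a more specific meta, assuming those column names and object dtypes:
# sketch: describe find_root's output so Dask's metadata matches the computed frames
meta = pd.DataFrame({'root': pd.Series(dtype='object'),
                     'node': pd.Series(dtype='object')})
result = df.groupby('link_id').apply(find_root, meta=meta)
computed_result = result.compute()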

How to append data to a list and plot it with Python Plotly

Hi there, I'm still trying to build a trading bot and plot the signals against their time and low-price data.
I want to get the buy signals I've specified in the if condition (when macdh turns from negative to positive) and then plot them, but I cannot add them to the buy_signals = [] list.
My error is:
self.plotData(buy_signals = buy_signals)
IndexError: list index out of range
import requests
import json
import pandas as pd
from stockstats import StockDataFrame as Sdf
import plotly.graph_objects as go
from plotly.offline import plot

class TradingModel:

    def __init__(self, symbol):
        self.symbol = symbol
        self.df = self.getData()

    def getData(self):
        # define URL
        base = 'https://api.binance.com'
        endpoint = '/api/v3/klines'
        params = '?&symbol='+self.symbol+'&interval=4h'
        url = base + endpoint + params
        # download data
        data = requests.get(url)
        dictionary = data.json()
        # put in dataframe and clean-up
        df = pd.DataFrame.from_dict(dictionary)
        df = df.drop(range(6, 12), axis=1)
        # rename columns and stockstats
        col_names = ['time', 'open', 'high', 'low', 'close', 'volume']
        df.columns = col_names
        stock = Sdf.retype(df)
        for col in col_names:
            df[col] = df[col].astype(float)
        # defined macdh
        df['macdh'] = stock['macdh']
        return df

    def strategy(self):
        df = self.df
        buy_signals = []
        for i in range(1, len(df['close'])):
            if df['macdh'].iloc[-1] > 0 and df['macdh'].iloc[-2] < 0:
                buy_signals.append([df['time'][i], df['low'][i]])
        self.plotData(buy_signals = buy_signals)

    def plotData(self, buy_signal=False):
        df = self.df
        candle = go.Candlestick(
            x=df['time'],
            open=df['open'],
            close=df['close'],
            high=df['high'],
            low=df['low'],
            name="Candlesticks"
        )
        macdh = go.Scatter(
            x=df['time'],
            y=df['macdh'],
            name="Macdh",
            line=dict(color=('rgba(102, 207, 255, 50)')))
        Data = [candle, macdh]
        if buy_signals:
            buys = go.Scatter(
                x = [item[0] for item in buy_signals],
                y = [item[1] for item in buy_signals],
                name = "Buy Signals",
                mode = "markers",
            )
            sells = go.Scatter(
                x = [item[0] for item in buy_signals],
                y = [item[1]*1.04 for item in buy_signals],
                name = "Sell Signals",
                mode = "markers",
            )
            data = [candle, macdh, buys, sells]
        # style and display
        layout = go.Layout(title = self.symbol)
        fig = go.Figure(data = data, layout = layout)
        plot(fig, filename=self.symbol)

def Main():
    symbol = "BTCUSDT"
    model = TradingModel(symbol)
    model.strategy()

if __name__ == '__main__':
    Main()
You need to replace:
- self.plotData(buy_signals[i]) by self.plotData(buy_signals)
- def plotData(self, buy_signal=False): by def plotData(self, buy_signals=None):
And it should be good to go!
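Putting those two changes together, the affected methods would look roughly like this (a sketch of just the relevant lines, with the loop condition kept exactly as in the question):
# sketch: only the parts the answer changes, inside a stripped-down class
class TradingModel:
    def strategy(self):
        df = self.df
        buy_signals = []
        for i in range(1, len(df['close'])):
            if df['macdh'].iloc[-1] > 0 and df['macdh'].iloc[-2] < 0:
                buy_signals.append([df['time'][i], df['low'][i]])
        self.plotData(buy_signals=buy_signals)      # pass the whole list, not buy_signals[i]

    def plotData(self, buy_signals=None):           # parameter name matches the keyword used in the call
        if buy_signals:
            print(len(buy_signals), "buy signals")  # the plotting code from the question goes here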

merging .xlsx files into one without overwriting data

import os
import tkinter as tk
from tkinter import filedialog
import pandas as pd
import xlrd
import openpyxl
from openpyxl import load_workbook
import datetime
from dataclasses import dataclass
from openpyxl.styles import Font,Color,Alignment,Border,Side,colors
import numpy as np
from xlsxwriter.utility import xl_rowcol_to_cell
import xlwt
from xlwt import Workbook
import functools
import numpy as np
from itertools import repeat, chain
import glob
root= tk.Tk()
canvas1 = tk.Canvas(root, width = 300, height = 300, bg = 'lightsteelblue')
canvas1.pack()
def getExcel():
    global df
    import_file_path = filedialog.askopenfilename()
    df = pd.read_excel(import_file_path)
    del df['PART CODE']
    del df['SUPPLIER CODE']
    del df['COMPANY OR SUB-CONT']
    del df['SUB-CONT UNIT COST']
    del df['HLR']
    del df['HOURLY MULTIPLIER']
    del df['MATERIAL MULTIPLIER']
    del df['ROUGH-IN HOURS']
    del df['FINISH HOURS']
    del df['PRELIMINARY TEXT']
    del df['FORMAL TEXT']
    del df['SUBCONT TEXT']
    del df['Part Image Path']
    df.insert(0, 'Id', '')
    df.insert(1, 'M', 'M')
    df.insert(3, 'SubCategory', '')
    df.insert(4, 'DrillDowns', '')
    df.insert(6, 'Name', '')
    df = df.rename(columns={'PART UNIT TYPE': 'MeasurementType'})
    df.insert(8, 'OnOffSwitch', 'No')
    df['SubCategory'] = df['CATEGORY']
    df = df.rename(columns={'PART DESCRIPTION': 'Note'})
    df['Name'] = df['Note']
    df = df.rename(columns={'COMPANY UNIT COST': 'PRICE'})
    new_row = pd.DataFrame({'Id': '(BLANK = NEW)', 'M': 'P', 'CATEGORY': 'Brand',
        'SubCategory': 'Name', 'DrillDowns': 'Price - Bathroom {nwPFGtikvZ}', 'Name': 'Price - HVAC {cp7lAPx4IO}', 'Note': 'Price - XPS1 {qX8FFEVmqP}',
        'MeasurementType': 'Price - PRIME {atGoZ7zLsE}', 'OnOffSwitch': 'Price - ARCHIVE {NtbEEROpa9}', 'NeedToReplace': 'Price - FLOORING {AskrHJL9ab}', 'NeedToReplace1': 'Price - TEST {jOn0TaUDmU}', 'NeedToReplace2': 'Price - Kitchen Refacing {9iFFUgrQBr}', 'NeedToReplace3': 'Price - EAGLE EYES {X8ExSUDoFH}', 'NeedToReplace4': 'Price - Basement {ajuemFbXaL}', 'NeedToReplace5': 'Price - Egress Windows {69790nzjKb}'},
        index=[0])
    # simply concatenate both dataframes
    df = pd.concat([new_row, df]).reset_index(drop=True)
    df = df.fillna('')
    #new_row1 = pd.DataFrame({'Id':' ', 'M':'P', 'CATEGORY': 'SITE-PREP'}, index= [2])
    switches = df['M'].ne(df['M'].shift(16000))
    idx = switches[switches].index
    df_new = pd.DataFrame(index=idx + 1.5)
    df = pd.concat([df, df_new]).sort_index()
    #df = pd.concat([new_row1, df]).reset_index(drop = True)
    df = df.fillna('NO VALUE')
    df.M = df.M.replace({'NO VALUE': "P"})
    df.Id = df.Id.replace({'NO VALUE': ""})
    df.DrillDowns = df.DrillDowns.replace({'NO VALUE': "xxxx"})
    df.Name = df.Name.replace({'NO VALUE': "xxxx"})
    df.Note = df.Note.replace({'NO VALUE': "xxxx"})
    df.MeasurementType = df.MeasurementType.replace({'NO VALUE': "xxxx"})
    df.OnOffSwitch = df.OnOffSwitch.replace({'NO VALUE': "xxxx"})
    df.NeedToReplace = df.NeedToReplace.replace({'NO VALUE': "xxxx"})
    df.NeedToReplace1 = df.NeedToReplace1.replace({'NO VALUE': "xxxx"})
    df.NeedToReplace2 = df.NeedToReplace2.replace({'NO VALUE': "xxxx"})
    df.NeedToReplace3 = df.NeedToReplace3.replace({'NO VALUE': "xxxx"})
    df.NeedToReplace5 = df.NeedToReplace3.replace({'NO VALUE': "xxxx"})
    df['NeedToReplace4'] = df['PRICE'].shift(1)
    df = df.fillna("Price - Basement {ajuemFbXaL}")
    df.NeedToReplace4 = df.NeedToReplace4.replace({'NO VALUE': ""})
    del df['PRICE']
    df = df.rename(columns={'NeedToReplace': ''})
    df = df.rename(columns={'NeedToReplace1': ' '})
    df = df.rename(columns={'NeedToReplace2': ' '})
    df = df.rename(columns={'NeedToReplace3': ' '})
    df = df.rename(columns={'NeedToReplace4': ' '})
    df = df.rename(columns={'NeedToReplace5': ' '})
    df.CATEGORY.replace('NO VALUE', df.Note.shift(1), inplace=True)
    df.SubCategory.replace('NO VALUE', df.Name.shift(1), inplace=True)
    pd.set_option('display.max_rows', df.shape[0]+1)
    pd.set_option('display.max_colwidth', None)
browseButton_Excel = tk.Button(text='Select Excel File', command=getExcel, bg='green', fg='white', font=('helvetica', 12, 'bold'))
canvas1.create_window(150, 150, window=browseButton_Excel)
root.mainloop()
df.reset_index(drop=True)
df.to_excel(r'C:\Users\Larso\Desktop\ClearEstimatesEstimate\LeapPriceGuideExport.xlsx', sheet_name='Price Guide', index = False)
file1 = pd.read_excel("LeapPriceGuideExport.xlsx")
file2 = pd.read_excel("test.xlsx")
file3 = file1.merge(file2, on="ID", how="outer")
file3.to_excel("merged.xlsx")
This is my current code above. I'm editing an Excel file to go into an existing .xlsx file, but every time I try to merge the files I get this error:
FileNotFoundError: [Errno 2] No such file or directory: 'LeapPriceGuideExport.xlsx'
I clearly see the file in the folder, but I'm not sure what I am missing. Thank you, and please let me know if things didn't come through clearly; I am still getting used to posting and asking questions.
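For what it's worth, one likely cause (an assumption, since the working directory isn't shown) is that the file is written with a full Desktop path but read back with a bare filename, which pandas resolves against the current working directory. A hedged sketch that reads and writes through the same folder:
import os
import pandas as pd

# assumption: all three files live in this folder; adjust to your setup
folder = r'C:\Users\Larso\Desktop\ClearEstimatesEstimate'

file1 = pd.read_excel(os.path.join(folder, 'LeapPriceGuideExport.xlsx'))
file2 = pd.read_excel(os.path.join(folder, 'test.xlsx'))
file3 = file1.merge(file2, on="ID", how="outer")
file3.to_excel(os.path.join(folder, 'merged.xlsx'), index=False)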

Print multiple outputs one by one from an Index

import re
import pandas as pd
import sqlite3

connection = sqlite3.connect(r'C:\Users\wh112\Desktop\BRAIN.sqlite')  # raw string avoids "\U" escape issues
df1 = pd.read_sql('select * from PRONOUN', connection)
df2 = pd.read_sql('select * from VERB', connection)
df3 = pd.read_sql('select * from QUESTIONS', connection)
df = pd.concat([df1, df2, df3])

def word_list(text):
    return list(filter(None, re.split(r'\W+', text)))

session = raw_input("Test on me!")
feedback = session
print(word_list(feedback))

dff = pd.DataFrame({'Sentence': [feedback]})
dff['1'] = dff['Sentence'].astype(str).str.split().str[0]
dff['2'] = dff['Sentence'].astype(str).str.split().str[1]
dff['3'] = dff['Sentence'].astype(str).str.split().str[2]

for pts1 in dff['1']:
    pts1 = df.columns[df.isin([pts1]).any()]
for pts2 in dff['2']:
    pts2 = df.columns[df.isin([pts2]).any()]
for pts3 in dff['3']:
    pts3 = df.columns[df.isin([pts3]).any()]
Now the topic is this line:
pts1 = df.columns[df.isin([pts1]).any()]
When I use this to find a match for a string from the database, multiple matches show up joined together in the output, like this:
Index([u'auxiliary_verbfirst_person_singular_pronounnon_wh_type'])
But I want them separated, one by one, like:
Index([u'auxiliary_verb, first_person_singular_pronoun,non_wh_type'])
Is there any way to do this? Can you help me, please?
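If the Index returned by df.columns[df.isin([pts1]).any()] does contain separate column names (an assumption; the fused string above may instead mean the column labels themselves are concatenated), you can print them one by one or join them with commas, for example:
matches = df.columns[df.isin([pts1]).any()]
for name in matches:          # one match per line
    print(name)
print(", ".join(matches))     # or all matches on one line, comma-separated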
