Cannot save modifications made in xlsx file - python

I read a .xlsx file and update it, but I'm not able to save the changes:
from xml.dom import minidom as md

[... some code ....]

sheet = workDir + '/xl/worksheets/sheet'
sheet1 = sheet + '1.xml'
importSheet1 = open(sheet1, 'r')
whole_file = importSheet1.read()
data_Sheet = md.parseString(whole_file)

[... some code ....]

self.array_mem_name = []
y = 1
x = 5  # first useful row
day = int(day)
found = 0
while x <= len_array_shared:
    readrow = data_Sheet.getElementsByTagName('row')[x]
    c_data = readrow.getElementsByTagName('c')[0]
    c_attrib = c_data.getAttribute('t')
    if c_attrib == 's':
        vName = c_data.getElementsByTagName('v')[0].firstChild.nodeValue
        #if int(vName) != broken:
        mem_name = self.array_shared[int(vName)]
        if mem_name != '-----':
            if mem_name == old:
                c_data = readrow.getElementsByTagName('c')[day]
                c_attrib = c_data.getAttribute('t')
                if (c_attrib == 's'):
                    v_Attrib = c_data.getElementsByTagName('v')[0].firstChild.nodeValue
                    if v_Attrib != '':
                        #loc = self.array_shared[int(v_Attrib)]
                        index = self.array_shared.index('--')
                        c_data.getElementsByTagName('v')[0].firstChild.nodeValue = index

with open(sheet1, 'w') as f:
    f.write(whole_file)
As you can see, I call f.write(whole_file), but whole_file does not contain the change made via index.
Stepping through with the debugger I can see that the new value has been added to the DOM node, yet I can't save sheet1 with the modified value.
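(For reference: parseString builds a separate DOM tree, so edits to data_Sheet never touch the whole_file string, which is immutable. A minimal sketch of serializing the modified tree instead, assuming the code above has run:)

with open(sheet1, 'w') as f:
    f.write(data_Sheet.toxml())  # serialize the edited DOM, not the original string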

I switched to using openpyxl instead, as Lei Yang suggested in a comment, and found that it worked much better for my job. With openpyxl, reading cell values is far easier than with xml.dom.minidom.
My only concern is that openpyxl seems noticeably slower than minidom at loading the workbook; perhaps memory was the bottleneck. But I was more interested in having something simple to use than in that minor performance issue.
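For comparison, this is roughly what the read-modify-save cycle looks like in openpyxl (a minimal sketch; the filename, sheet name, and cell coordinates are placeholders):

from openpyxl import load_workbook

wb = load_workbook('schedule.xlsx')    # placeholder filename
ws = wb['Sheet1']                      # placeholder sheet name
ws.cell(row=5, column=2, value='--')   # overwrite a cell directly
wb.save('schedule.xlsx')               # openpyxl handles the XML serialization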

Related

Skip First Column in CSV File with Pandas

I have a generated CSV file with some extra information in its first line. I'm trying to skip that line, but it doesn't seem to work. I looked at several suggestions and examples:
I tried using skiprows.
I also looked at several other examples.
Pandas drop first columns after csv read
https://datascientyst.com/pandas-read-csv-file-read_csv-skiprows/
Nothing I tried worked the way I wanted: when I did get it to work, it deleted an entire row instead.
Here is a sample of the code:
# Imports the pandas module. It must be installed to run this script.
import pandas as pd

# Source file path
source_file = 'Csvfile.csv'

# Reads the csv file, decoding it with a compatible encoding.
dataframe = pd.read_csv(source_file, encoding='latin1')
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Grayscale, 'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})

# Formats data so that it can be used to count duplex and color pages.
df.loc[df["Duplex"] == "DUPLEX", "Duplex"] = dataframe.Pages
df.loc[df["Duplex"] == "NOT DUPLEX", "Duplex"] = 0
df.loc[df["Color"] == "NOT GRAYSCALE", "Color"] = dataframe.Pages
df.loc[df["Color"] == "GRAYSCALE", "Color"] = 0
df.sort_values(by=['User', 'Pages'])
file = df.to_csv('PrinterLogData.csv', index=False)

# Opens the parsed CSV file.
output_source = "PrinterLogData.csv"
dataframe = pd.read_csv(output_source, encoding='latin1')

# Creates a new DataFrame.
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Color, 'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})

# Groups data by user and printer sums.
Report1 = df.groupby(['User'], as_index=False).sum().sort_values('Pages', ascending=False)
Report2 = df.groupby(['Printer'], as_index=False).sum().sort_values('Pages', ascending=False)
Sample Data
Sample Output of what I'm looking for.
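If the goal is only to skip that extra first line, pandas' skiprows should handle it; dropping a leading column by position is a separate step. A hypothetical sketch (filename and encoding taken from the question):

import pandas as pd

# Skip the metadata line so the real header row is used for column names.
df = pd.read_csv('Csvfile.csv', encoding='latin1', skiprows=1)

# If the first *column* also has to go, slice it off by position.
df = df.iloc[:, 1:]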
This is an early draft of what you appear to want for your program (based on the simulated print-log.csv):
import csv
import itertools
import operator
import pathlib

CSV_FILE = pathlib.Path('print-log.csv')
EXTRA_COLUMNS = ['Pages', 'Grayscale', 'Color', 'Not Duplex', 'Duplex']

def main():
    with CSV_FILE.open('rt', newline='') as file:
        iterator = iter(file)
        next(iterator)  # skip first line if needed
        reader = csv.DictReader(iterator)
        table = list(reader)
    create_report(table, 'Printer')
    create_report(table, 'User')

def create_report(table, column_name):
    key = operator.itemgetter(column_name)
    table.sort(key=key)
    field_names = [column_name] + EXTRA_COLUMNS
    with pathlib.Path(f'{column_name} Report').with_suffix('.csv').open(
        'wt', newline=''
    ) as file:
        writer = csv.DictWriter(file, field_names)
        writer.writeheader()
        report = []
        for key, group in itertools.groupby(table, key):
            report.append({column_name: key} | analyze_group(group))
        report.sort(key=operator.itemgetter('Pages'), reverse=True)
        writer.writerows(report)

def analyze_group(group):
    summary = dict.fromkeys(EXTRA_COLUMNS, 0)
    for row in group:
        pages = int(row['Pages']) * int(row['Copies'])
        summary['Pages'] += pages
        summary['Grayscale'] += pages if row['Grayscale'] == 'GRAYSCALE' else 0
        summary['Color'] += pages if row['Grayscale'] == 'NOT GRAYSCALE' else 0
        summary['Not Duplex'] += pages if row['Duplex'] == 'NOT DUPLEX' else 0
        summary['Duplex'] += pages if row['Duplex'] == 'DUPLEX' else 0
    return summary

if __name__ == '__main__':
    main()
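One portability note on the draft above: {column_name: key} | analyze_group(group) uses the dict union operator, which requires Python 3.9+; on older versions the equivalent spelling is {column_name: key, **analyze_group(group)}.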

How to read XML data and write to Excel sheet in Python for Selenium Automation

I have a scenario where I need to get data from an XML file, write it to an Excel sheet, and then use that sheet for data processing.
I am able to read the data from the XML, but not able to insert the same data (records) into an Excel file.
I am using openpyexcel for this; please suggest any alternative and help me here.
I am not seeing any error, but nothing is being written to the Excel sheet.
import xml.etree.ElementTree as ET
import openpyexcel

tree = ET.parse("Test_Cust.xml")
root = tree.getroot()
workbook = openpyexcel.load_workbook("xml_excel.xlsx")
sheet = workbook["Sheet1"]

for items in root.iter():
    if items.tag == "Email":
        cust_email = items.text
    elif items.tag == "CompanyName":
        cust_cn = items.text
    elif items.tag == "FirstName":
        cust_fn = items.text
    elif items.tag == "LastName":
        cust_ln = items.text

rownum = sheet.max_row
print(rownum)
colnum = sheet.max_column
print(colnum)

for r in range(2, rownum + 1):
    for c in range(1, colnum + 1):
        sheet.cell(row=r, column=c).value = cust_email
        sheet.cell(row=r, column=c).value = cust_email
        sheet.cell(row=r, column=c).value = cust_email
        sheet.cell(row=r, column=c).value = cust_email
        workbook.save("xml_excel.xlsx")

print("Done")
Have you tried writing to the cell with this syntax?

ws.cell(column=colnum, row=rownum, value='mydata')
Note: you are saving the workbook in the inner loop. Slow!
Apart from that, this is a valid question: how can you easily read XML-formatted data into openpyxl? But the question is really about parsing the incoming XML; that done, writing it out with openpyxl seems trivial.
(Oops, my mistake: my answer is for openpyxl, and may not work with openpyexcel.)
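To make that concrete, here is a minimal sketch that parses the records and appends one worksheet row per record, saving once at the end; the tag names come from the question, but the "Customer" record element is an assumption about the XML layout:

import xml.etree.ElementTree as ET
from openpyxl import load_workbook

tree = ET.parse("Test_Cust.xml")
root = tree.getroot()
workbook = load_workbook("xml_excel.xlsx")
sheet = workbook["Sheet1"]

# "Customer" as the per-record element is an assumption about the XML layout.
for customer in root.iter("Customer"):
    sheet.append([
        customer.findtext("Email"),
        customer.findtext("CompanyName"),
        customer.findtext("FirstName"),
        customer.findtext("LastName"),
    ])

workbook.save("xml_excel.xlsx")  # save once, after the loop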

Python Not Creating Excel File with xlsxwriter

I have an Excel file with items and descriptions, and I'm trying to compare the descriptions for similarity; if two are similar, the items go into a new Excel file. The items also have catalog numbers, which I compare as well: if two catalog numbers are nothing alike but come from the same vendor (buy_line), those also go into the new file. The script takes far too long to run, and by the time I come back Spyder has closed and no new file exists. So this is a two-part question: is there a way to make the code faster, and why is no file being created? Thank you in advance. My code is below:
import xlrd
import xlsxwriter
from fuzzywuzzy import fuzz

AllItems = xlrd.open_workbook('2-18All_Items-CleanUp.xlsx', 'rb')
sheets = AllItems.sheet_names()
item = []
base = []
kit = []
buy_line = []
catalogs = []
descriptions = []
similar_desc_item = []
similar_desc = []
diff_catalog_samebuyline = []
sh = AllItems.sheet_by_index(0)

def readexcelfunc():
    for rownum in range(sh.nrows):
        row_values = sh.row_values(rownum)
        item.append(row_values[0])
        base.append(row_values[1])
        kit.append(row_values[2])
        buy_line.append(row_values[6])
        catalogs.append(row_values[8])
        descriptions.append(row_values[12])

def check_similar_desc():
    for i, k in enumerate(descriptions):
        for j, l in enumerate(descriptions):
            ratio1 = fuzz.token_sort_ratio(k, l)
            if ratio1 > 95 and k != l and base[i] != base[j] and kit[i] == "No":
                similar_desc_item.append(item[i])

def check_notmatching_catalog():
    for x, a in enumerate(catalogs):
        for y, b in enumerate(catalogs):
            ratio2 = fuzz.token_sort_ratio(a, b)
            if ratio2 < 10 and buy_line[x] == buy_line[y]:
                diff_catalog_samebuyline.append(catalogs[x])

def Create_ExcelFile():
    NewWorkbook = xlsxwriter.Workbook('Sim_Desc.xlsx')
    worksheet = NewWorkbook.add_worksheet()
    row1 = 0
    row2 = 0
    for items in similar_desc_item:
        worksheet.write(row1, 0, items)
        row1 += 1
    for catalognumb in diff_catalog_samebuyline:
        worksheet.write(row2, 3, catalognumb)
        NewWorkbook.save()
        NewWorkbook.close()

readexcelfunc()
check_similar_desc()
print(similar_desc_item)
check_notmatching_catalog()
Create_ExcelFile()
print("Finished")
There are a few issues in the Create_ExcelFile() function. The first is that there is no workbook save() method in XlsxWriter. Also, you aren't incrementing row2, so the second write() will always write to the first row, overwriting whatever else is there. Most importantly, though, the close() is at the wrong level, so you are closing the file too early. Something like this should work:
def Create_ExcelFile():
    NewWorkbook = xlsxwriter.Workbook('Sim_Desc.xlsx')
    worksheet = NewWorkbook.add_worksheet()
    row1 = 0
    row2 = 0
    for items in similar_desc_item:
        worksheet.write(row1, 0, items)
        row1 += 1
    for catalognumb in diff_catalog_samebuyline:
        worksheet.write(row2, 3, catalognumb)
        # Fix the row2 increment!!
    NewWorkbook.close()
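As a side note, XlsxWriter's write_column() can write each list in a single call, which sidesteps the row counters entirely; a minimal sketch under the same variable names:

def Create_ExcelFile():
    NewWorkbook = xlsxwriter.Workbook('Sim_Desc.xlsx')
    worksheet = NewWorkbook.add_worksheet()
    worksheet.write_column(0, 0, similar_desc_item)         # column A
    worksheet.write_column(0, 3, diff_catalog_samebuyline)  # column D
    NewWorkbook.close()  # close() writes the file; there is no save()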

Append Function Nested Inside IF Statement Body Not Working

I am fairly new to Python (just started learning in the last two weeks) and am trying to write a script that parses a csv file and extracts some of the fields into a list:
from string import Template
import csv
import string

site1 = 'D1'
site2 = 'D2'
site3 = 'D5'
site4 = 'K0'
site5 = 'K1'
site6 = 'K2'
site7 = '0'
site8 = '0'
site9 = '0'
lbl = 1
portField = 'y'
sw = 5
swpt = 6
cd = 0
pt = 0
natList = []

with open(name=r'C:\Users\dtruman\Documents\PROJECTS\SCRIPTING - NATAERO DEPLOYER\NATAERO DEPLOYER V1\nataero_deploy.csv') as rcvr:
    for line in rcvr:
        fields = line.split(',')
        Site = fields[0]
        siteList = [site1, site2, site3, site4, site5, site6, site7, site8, site9]
        while Site in siteList == True:
            Label = fields[lbl]
            Switch = fields[sw]
            if portField == 'y':
                Switchport = fields[swpt]
                natList.append([Switch, Switchport, Label])
            else:
                Card = fields[cd]
                Port = fields[pt]
                natList.append([Switch, Card, Port, Label])
print natList
Even if I strip the else statement away and break into my code right after the if clause, I can verify that Switchport (first statement in the if clause) is successfully being populated with a str from my csv file, as are Switch and Label. However, natList is not being appended with the fields parsed from each line of my csv for some reason. Python returns no errors; it just does not append natList at all.
This is actually going to be a function (once I get the code itself to work), but for now I am simply setting the function parameters as global variables so I can run it in an IPython console without having to call the function.
The lbl, sw, swpt, cd, and pt variables refer to column numbers in my csv (the finished function will let the user enter values for these).
I assume I am running into some issue with natList scope, but I have tried moving the natList = [] statement to various places in my code to no avail.
I can run the above in a console, then run natList.append([Switch, Switchport, Label]) separately, and that works for some reason...?
Thanks for any assistance!
The while condition needs extra parentheses: write while (Site in siteList) == True:, or, much cleaner, as Padraic suggested, while Site in siteList:.
The original line is a chained comparison, so Site in siteList == True evaluates as (Site in siteList) and (siteList == True), which is always False because a list never equals True.
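A quick self-contained illustration of that chaining behaviour:

sites = ['D1', 'D2']
print('D1' in sites == True)    # False: parsed as ('D1' in sites) and (sites == True)
print(('D1' in sites) == True)  # True: the parenthesised test runs first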
Change
while Site in siteList == True:
to
if Site in siteList:
You might want to look into the csv module, which attempts to make reading and writing csv files simpler, e.g.:

import csv

with open('<file>') as fp:
    ...
    reader = csv.reader(fp)
    if portfield == 'y':
        natlist = [[row[i] for i in [sw, swpt, lbl]]
                   for row in reader if row[0] in sitelist]
    else:
        natlist = [[row[i] for i in [sw, cd, pt, lbl]]
                   for row in reader if row[0] in sitelist]
    print natlist
Or alternatively use a csv.DictReader, which takes the first row as the field names and then returns dictionaries:

import csv

with open('<file>') as fp:
    ...
    reader = csv.DictReader(fp)
    if portfield == 'y':
        fields = ['Switch', 'card/port', 'Label']
    else:
        fields = ['Switch', '??', '??', 'Label']
    natlist = [[row[f] for f in fields]
               for row in reader if row['Building/Site'] in sitelist]
    print natlist

Modify location of a genbank feature

Edit: I know feature.type will give gene/CDS, and feature.qualifiers will then give "db_xref"/"locus_tag"/"inference" etc. Is there a feature attribute that will let me access the location (e.g. [5240:7267](+)) directly?
This URL gives a bit more info, though I can't figure out how to use it for my purpose: http://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html#location_operator
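For reference, a SeqFeature does expose this via its location attribute, whose start, end and strand can be read directly; a minimal sketch, assuming the GenBank file from the question contains a single record:

from Bio import SeqIO

record = SeqIO.read('mtbtomod.gb', 'genbank')  # filename from the question
for feature in record.features:
    if feature.type in ('gene', 'CDS'):
        print(feature.qualifiers.get('locus_tag'),
              feature.location.start, feature.location.end, feature.location.strand)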
Original Post:
I am trying to modify the location of features within a GenBank file. Essentially, I want to modify the following bit of a GenBank file:
gene 5240..7267
/db_xref="GeneID:887081"
/locus_tag="Rv0005"
/gene="gyrB"
CDS 5240..7267
/locus_tag="Rv0005"
/inference="protein motif:PROSITE:PS00177"
...........................
to
gene 5357..7267
/db_xref="GeneID:887081"
/locus_tag="Rv0005"
/gene="gyrB"
CDS 5357..7267
/locus_tag="Rv0005"
/inference="protein motif:PROSITE:PS00177"
.............................
Note the change from 5240 to 5357.
So far, from scouring the internet and Stackoverflow, I have:
from Bio import SeqIO

gb_file = "mtbtomod.gb"
gb_record = SeqIO.parse(open(gb_file, "r+"), "genbank")
rvnumber = 'Rv0005'
newstart = 5357
final_features = []
for record in gb_record:
    for feature in record.features:
        if feature.type == "gene":
            if feature.qualifiers["locus_tag"][0] == rvnumber:
                if feature.location.strand == 1:
                    feature.qualifiers["amend_position"] = "%s:%s" % (newstart, feature.location.end + 1)
                else:
                    # do the reverse for the complementary strand
                    pass
        final_features.append(feature)
    record.features = final_features
    with open("testest.gb", "w") as testest:
        SeqIO.write(record, testest, "genbank")
This basically creates a new qualifier called "amend_position"; however, what I would like to do is modify the location directly (with or without creating a new file).
Rv0005 is just an example of a locus_tag I need to update. I have about 600 more locations to update, which explains the need for a script. Help!
OK, I now have something which fully works. I'll post the code in case anyone ever needs something similar:
__author__ = 'Kavin'

from Bio import SeqIO
from Bio import SeqFeature
import xlrd
import sys
import re

workbook = xlrd.open_workbook(sys.argv[2])
sheet = workbook.sheet_by_index(0)
data = [[sheet.cell_value(r, c) for c in range(sheet.ncols)] for r in range(sheet.nrows)]

# Create dicts to store TSS data
TSS = {}
row = {}

# For each entry (row), store the start codon and strand information
for i in range(2, sheet.nrows - 1):
    if data[i][5] < -0.7:  # Ensures BASS score is within significant range
        Gene = data[i][0]
        row['Direction'] = str(data[i][3])
        row['StartCodon'] = int(data[i][4])
        TSS[str(Gene)] = row
        row = {}
    else:
        i += 1

# Create an output filename based on the input filename
outfile_init = re.search('(.*)\.(\w*)', sys.argv[1])
outfile = str(outfile_init.group(1)) + '_modified.' + str(outfile_init.group(2))

final_features = []
for record in SeqIO.parse(open(sys.argv[1], "r"), "genbank"):
    for feature in record.features:
        if feature.type == "gene" or feature.type == "CDS":
            if TSS.has_key(feature.qualifiers["locus_tag"][0]):
                newstart = TSS[feature.qualifiers["locus_tag"][0]]['StartCodon']
                if feature.location.strand == 1:
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(newstart - 1),
                        SeqFeature.ExactPosition(feature.location.end.position),
                        feature.location.strand)
                else:
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(feature.location.start.position),
                        SeqFeature.ExactPosition(newstart),
                        feature.location.strand)
        final_features.append(feature)  # Append final features
    record.features = final_features
    with open(outfile, "w") as new_gb:
        SeqIO.write(record, new_gb, "genbank")
This assumes usage such as: python program.py <genbankfile> <excel spreadsheet>
It also assumes a spreadsheet in the following format:
Gene Synonym Tuberculist_annotated_start Orientation Re-annotated_start BASS_score
Rv0005 gyrB 5240 + 5357 -1.782
Rv0012 Rv0012 14089 + 14134 -1.553
Rv0018c pstP 23181 - 23172 -2.077
Rv0032 bioF2 34295 + 34307 -0.842
Rv0037c Rv0037c 41202 - 41163 -0.554
So, you can try something like the code below. The number of changes will equal the number of CDS/gene entries found in the file. You can read the locations/positions from a csv/text file and build a list like the change_values list I created manually here.
import re

f = open("test.txt")
change_values = ["1111", "2222"]
flag = True
next = 0
for i in f.readlines():
    if i.startswith(' CDS') or i.startswith(' gene'):
        out = re.sub(r"\d+", str(change_values[next]), i)
        # Instead of print, write
        print out
        flag = not flag
        if flag == True:
            next += 1
    else:
        # Instead of print, write
        print i
My sample test.txt file looks like this:
gene 5240..7267
/db_xref="GeneID:887081"
/locus_tag="Rv0005"
/gene="gyrB"
CDS 5240..7267
/locus_tag="Rv0005"
/inference="protein motif:PROSITE:PS00177"
...........................
gene 5240..7267
/db_xref="GeneID:887081"
/locus_tag="Rv0005"
/gene="gyrB"
CDS 5240..7267
/locus_tag="Rv0005"
/inference="protein motif:PROSITE:PS00177"
...........................
Hope this solves your issue. Cheers!
I think this can be done with native Biopython syntax; no regex needed. A minimal working example:
from Bio import SeqIO
from Bio import SeqFeature
import copy

gbk = SeqIO.read('./test_gbk', 'gb')
index = 1  # depends which feature you want to change;
           # can also be done with a loop if you want to change them all, or by some function...
feature_to_change = copy.deepcopy(gbk.features[index])

new_start = 0
new_end = 100

# create a new feature location object
new_feature_location = SeqFeature.FeatureLocation(new_start, new_end, feature_to_change.location.strand)
feature_to_change.location = new_feature_location  # change the old feature location

del gbk.features[index]  # remove the changed feature
gbk.features.append(feature_to_change)  # add an identical feature with the new location

# sort by the start of the location, which is the usual case
gbk.features = sorted(gbk.features, key=lambda feature: feature.location.start)

SeqIO.write(gbk, './test_gbk_with_new_feature', 'gb')
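One design note on the example above: SeqFeature.FeatureLocation accepts plain integers for start and end (wrapping them as exact positions internally), which is why new_start and new_end can be passed directly without constructing ExactPosition objects first.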
