I have hundreds of XML files that I need to extract two values from and ouput in an Excel or CSV file. This is the code I currently have:
#grabs idRoot and typeId root values from XML files
import glob
from openpyxl import Workbook
from xml.dom import minidom
import os
wb = Workbook()
ws = wb.active
def typeIdRoot (filename):
f = open(filename, encoding = "utf8")
for xml in f:
xmldoc = minidom.parse(f)
qmd = xmldoc.getElementsByTagName("MainTag")[0]
typeIdElement = qmd.getElementsByTagName("typeId")[0]
root = typeIdElement.attributes["root"]
global rootValue
rootValue = root.value
print ('rootValue =' ,rootValue,)
ws.append([rootValue])
wb.save("some.xlsx")
wb = Workbook()
ws = wb.active
def idRoot (filename):
f = open(filename, encoding = "utf8")
for xml in f:
xmldoc = minidom.parse(f)
tcd = xmldoc.getElementsByTagName("MainTag")[0]
activitiesElement = tcd.getElementsByTagName("id")[0]
sport = activitiesElement.attributes["root"]
sportName = sport.value
print ('idRoot =' ,sportName,)
ws.append([idRoot])
wb.save("some.xlsx")
for file in glob.glob("*.xml"):
typeIdRoot (file)
for file in glob.glob("*.xml"):
idRoot (file)
The first value follows a 1.11.111.1.111111.1.3 format. The second mixes letters and numbers. I believe this is the reason for the error:
Traceback (most recent call last):
File "C:\Python34\Scripts\xml\good.py", line 64, in <module>
idRoot (file)
File "C:\Python34\Scripts\xml\good.py", line 54, in idRoot
ws.append([idRoot])
File "C:\Python34\lib\site-packages\openpyxl\worksheet\worksheet.py", line 754, in append
cell = self._new_cell(col, row_idx, content)
File "C:\Python34\lib\site-packages\openpyxl\worksheet\worksheet.py", line 376, in _new_cell
cell = Cell(self, column, row, value)
File "C:\Python34\lib\site-packages\openpyxl\cell\cell.py", line 131, in __init__
self.value = value
File "C:\Python34\lib\site-packages\openpyxl\cell\cell.py", line 313, in value
self._bind_value(value)
File "C:\Python34\lib\site-packages\openpyxl\cell\cell.py", line 217, in _bind_value
raise ValueError("Cannot convert {0} to Excel".format(value))
ValueError: Cannot convert <function idRoot at 0x037D24F8> to Excel
I would like the result to add both values on the same row. So then I would have a new row for each file in the directory. I need to add the second value to the second row.
as such:
Value 1 Value 2
1.11.111.1.111111.1.3 10101011-0d10-0101-010d-0dc1010e0101
idRoot is the name of your FUNCTION.
So when you write
ws.append([idRoot])
you probably mean:
ws.append([sportName])
Of course, you can write something like:
ws.append([rootValue, sportName])
providing both variables are defined with reasonable values.
One last thing, you should save your file only once.
Related
I am trying to read a text file line by line and then print it to an excel sheet line by line
Here is what I have so far
for x in ABC:
print(f"{x}:")
sheet1[cellLocLastRow('A')] = f"{x}:"
try:
with open(f"./{x}/Log.txt") as f:
textRead= (f.read())
print(textRead)
sheet1[cellLocLastRow('A')] = textRead
except FileNotFoundError:
print("File does not exist")
sheet1[cellLocLastRow('A')] = "File does not exist"
It prints it out the text file to the excel sheet but all in one row like this
1
But I would like my text file to be printed out like this
2
If you were wondering why I am using [cellLocLastRow('A')] , I am using that instead of a [A17] because I am printing out unknown lengths of documents into an excel sheet and so it counts the rows.
def cellLocLastRow(colChar):
global lastRow
curRow = lastRow
lastRow += 1
return cellLoc(colChar, curRow)
The text file format is as follows:
TestName: TestName
TestName: Info::Info::Info::Info::f###::##.#ns
Total Errors: #
TestName: Info::Info::Info::Info::f###::##.#ns
Total Errors: #
TestName: Info::Info::Info::Info::f###::##.#ns
Total Errors: #
Did you tried f.readlines() method?
with open(text, 'r') as f:
content1 = f.readlines()
This script will return a list with all file's lines, then you can do whatever you want comfortably.
this is pretty easy with pylightxl
pip install pylightxl
lines = []
with open(“textfile.txt”) as f:
line = f.readline()
if not line:
break
lines.append(line)
import pylightxl as xl
db = xl.Database()
db.add_ws("Sheet1", {})
for i, line in enumerate(lines, start=1):
db.ws("Sheet1").update_index(i, 1,line)
xl.writexl(db, “output.xlsx”)
I have an .xlsx file which contains 2 worksheets. The first one contains regular data (nothing fancy), while the second one contains pivot tables. I need only the data form the first worksheet and I want to ignore the second one, but the pivot tables raise error: TypeError: expected <type 'basestring'> when openpyxl.load_workbook is called.
The error is raised in: openpyxl/reader/excel.py, in line: pivot_caches = parser.pivot_caches.
I tried with openpyxl versions 2.6.4 and 2.5.1. I'm using Python 2.7.
After deleting the 2nd worksheet, the error is gone and the data from the 1sth worksheet is read correctly. However, these files are uploaded by users and although I don't need the pivot tables, I would like to avoid forcing users to remove the unnecessary worksheet(s), if possible.
Sample code:
from io import BytesIO
import openpyxl
pivot = '~/Downloads/file_with_pivot_tables.xlsx'
with open(pivot) as fin:
content = BytesIO(fin.read())
wb = openpyxl.load_workbook(content) # this line fails
ws = wb.get_sheet_by_name('Sheet1')
Entire error trace:
File "/Users/gi/lib/openpyxl/reader/excel.py", line 224, in load_workbook
pivot_caches = parser.pivot_caches
File "/Users/gi/lib/openpyxl/packaging/workbook.py", line 125, in pivot_caches
cache = get_rel(self.archive, self.rels, id=c.id, cls=CacheDefinition)
File "/Users/gi/lib/openpyxl/packaging/relationship.py", line 162, in get_rel
obj.deps = get_dependents(archive, rels_path)
File "/Users/gi/lib/openpyxl/packaging/relationship.py", line 130, in get_dependents
rels = RelationshipList.from_tree(node)
File "/Users/gi/lib/openpyxl/descriptors/serialisable.py", line 84, in from_tree
obj = desc.expected_type.from_tree(el)
File "/Users/gi/lib/openpyxl/descriptors/serialisable.py", line 100, in from_tree
return cls(**attrib)
File "/Users/gi/lib/openpyxl/packaging/relationship.py", line 50, in __init__
self.Target = Target
File "/Users/gi/lib/openpyxl/descriptors/base.py", line 44, in __set__
raise TypeError('expected ' + str(self.expected_type))
TypeError: expected <type 'basestring'>
You can specify the sheet that you want to manipulate:
wb = openpyxl.load_workbook('H:\\myfile.xlsx')
ws = wb['sheet1']
ws['E1'] = 'The sky is gray.'
wb.save('H:\\myfile.xlsx')
wb.close()
You can also get a list of all the sheet names if you need to check them first:
print(wb.sheetnames)
I am trying to write a code that will fetch an api from my csv as input:
from pyzillow.pyzillow import ZillowWrapper, GetDeepSearchResults, GetUpdatedPropertyDetails
def get_zillowinfo(address,zipcode):
zillow_data = ZillowWrapper('X1-ZWz17seirkzuh7_93aho')
deep_search_response = zillow_data.get_deep_search_results(address,zipcode)
result1 = GetDeepSearchResults(deep_search_response) #get zillowid from address
updated_property_details_response = zillow_data.get_updated_property_details(result1.zillow_id)
result2 = GetUpdatedPropertyDetails(updated_property_details_response) # get detail property info
result = result2.home_info
return result
print get_zillowinfo('73 main blvd','01545')
#2
import csv
with open(r'C:\Users\bca\Desktop\Pricing Study-Zillow\sample4.csv', 'r') as csvfile:
spamreader = csv.reader(csvfile,delimiter=',')
next(spamreader)
for row in spamreader:
print row
#3
import csv
with open(r'C:\Users\bca\Desktop\Pricing Study-Zillow\sample4.csv', 'r') as csvfile:
spamreader = csv.reader(csvfile,delimiter=',')
next(spamreader)
for row in spamreader:
print get_zillowinfo(row[0],row[1])
When i do step #3, I get an error:
Traceback (most recent call last):
File "C:\Users\bca\AppData\Local\Continuum\anaconda2\lib\site-packages\IPython\core\interactiveshell.py", line 2895, in run_code
self.showtraceback()
File "C:\Users\bca\AppData\Local\Continuum\anaconda2\lib\site-packages\IPython\core\interactiveshell.py", line 1828, in showtraceback
self._showtraceback(etype, value, stb)
File "C:\Users\bca\AppData\Local\Continuum\anaconda2\lib\site-packages\ipykernel\zmqshell.py", line 547, in _showtraceback
u'evalue' : py3compat.safe_unicode(evalue),
File "C:\Users\bca\AppData\Local\Continuum\anaconda2\lib\site-packages\ipython_genutils\py3compat.py", line 65, in safe_unicode
return unicode_type(e)
TypeError: coercing to Unicode: need string or buffer, dict found
Why does this happen? is it because my characters are not strings? how do I change it to string in that case for all my data?
This is a repex of my dataset:
What do I need to change in my code to avoid that type error?
i am new to python and i learn a lot everyday . I have a specific folder that contains some xml file and i am parsing xml text of PMID ,Date ,Title and Abstract and i am writing csv of for loop with if else statement but it not printing it gives error it how to write csv of for loop with if else condition
Here is my python Code :
import os
try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
import csv
path = '/home/shayez/Desktop/project/kk'
listfile = []
files = os.listdir(path)
for name in files:
listfile.append(name)
pmdata = []
for name2 in listfile:
full_file = os.path.abspath(os.path.join('project/kk',name2))
dom = ET.parse(full_file)
pmdat = dom.findall('PubmedArticle')
pmdata.append(pmdat)
def Print_Data ():
header = ['PMID','Date','Title','Abstract']
with open ('/home/shayez/Desktop/karim.csv','wt') as csvfile:
writer = csv.writer(csvfile, delimiter ="\t" )
writer.writerow(header)
for d in pmdata:
for c in d :
PMID = c.find('MedlineCitation/PMID').text
title = c.find('MedlineCitation/Article/ArticleTitle').text
Date = c.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
Date2 = c.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate')
Abstract = c.find('MedlineCitation/Article/Abstract/AbstractText')
rows = [PMID,Date,title,Abstract]
if Abstract is None :
print PMID,"\t",Date.text, "\t",title ,"\t", "No abstract Available"
elif Date2 is None:
print PMID,"\t",Date.text, "\t",title ,"\t",Abstract.text
elif Date is None:
print PMID,"\t",Date2.text, "\t",title ,"\t",Abstract.text
else :
print PMID,"\t","No Date", "\t",title ,"\t", "No abstract Available"
rows = [PMID,Date,title,Abstract]
writer.writerows(rows)
Print_Data()
Error :
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/dist-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 699, in runfile
execfile(filename, namespace)
File "/usr/lib/python2.7/dist-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 81, in execfile
builtins.execfile(filename, *where)
File "/home/shayez/Desktop/k.py", line 72, in <module>
Print_Data()
File "/home/shayez/Desktop/k.py", line 67, in Print_Data
writer.writerows(rows)
ValueError: I/O operation on closed file
You need your for loop to be inside the with block, otherwise it will close csvfile:
def Print_Data ():
header = ['PMID','Date','Title','Abstract']
with open ('/home/shayez/Desktop/karim.csv','wt') as csvfile:
writer = csv.writer(csvfile, delimiter ="\t" )
writer.writerow(header)
for d in pmdata:
for c in d :
PMID = c.find('MedlineCitation/PMID').text
title = c.find('MedlineCitation/Article/ArticleTitle').text
Date = c.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
Date2 = c.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate')
Abstract = c.find('MedlineCitation/Article/Abstract/AbstractText')
rows = [PMID,Date,title,Abstract]
# Other code
writer.writerows(rows)
Now that you've moved your writer inside your with block, we can address the other issue. writer.writerows() expects an iterable of row objects. Your rows object is a single row, throwing an exception. To accomplish what you want to do, you'll have to collect all of the row entries you want to write into a list:
with open ('/home/shayez/Desktop/karim.csv','wt') as csvfile:
writer = csv.writer(csvfile, delimiter ="\t" )
writer.writerow(header)
rows = []
for d in pmdata:
for c in d:
# code to get PMID, Date, title, Abstract
rows.append([PMID, Date, title, Abstract])
# Note that this is after your two for loops
writer.writerows(rows)
The other option you have is to use writer.writerow(row) inside the for loop on each row
I have a bunch of xlsx files, named from 1 to 169 like '1.xlsx', '2.xlsx' and so on... But while going through for loop, that read that files, the code does not see any rows in the 11th file (nrows in 11th file always is 0, while it is not if you open it manually) and gives me the IndexError (while these files are not empty).
I have no idea of what is going on with that code.
import os, xlwt, xlrd
file_dir = 'docs/'
files = os.listdir(file_dir)
#Open file and, read neaded variables and write them
def r_file(path, file):
workbook = xlrd.open_workbook(path+file)
info_sheet = workbook.sheet_by_index(0)
data_sheet = workbook.sheet_by_index(1)
#cells with company info
print info_sheet.nrows
company_name = info_sheet.cell(3,3).value
company_leg_adress = info_sheet.cell(4,3).value
company_fact_adress = info_sheet.cell(5,3).value
#cells with answers
question_1 = data_sheet.cell(3,10).value
question_1_1 = data_sheet.cell(8,2).value
question_1_2 = data_sheet.cell(13,2).value
question_2 = data_sheet.cell(18,10).value
question_3 = data_sheet.cell(25,10).value
question_3_additional = [data_sheet.cell(nrow,10).value for nrow in range(30,48)]
question_4 = data_sheet.cell(51,2).value
question_5 = data_sheet.cell(56,2).value
#get full row in list
row_as_list = [company_name,company_leg_adress,company_fact_adress, question_1, question_1_1, question_1_2, question_2, question_3, question_4]+question_3_additional
return row_as_list
#write companies in file
def w_file(companies):
wb = xlwt.Workbook()
ws = wb.add_sheet('aggr', cell_overwrite_ok=True)
for company in companies:
row_as_list = r_file(file_dir,str(company)+'.xlsx')
for each_index in row_as_list:
ws.write(company, row_as_list.index(each_index) , each_index)
wb.save('aggregation.xls')
companies_amount = [x for x in range(1,170)]
w_file(companies_amount)
after running it, it returns:
Traceback (most recent call last):
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 44, in <module>
w_file(companies_amount)
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 36, in w_file
row_as_list = r_file(file_dir,str(company)+'.xlsx')
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 13, in r_file
company_name = info_sheet.cell(3,3).value
File "/usr/local/lib/python2.7/dist-packages/xlrd-1.0.0-py2.7.egg/xlrd/sheet.py", line 401, in cell
self._cell_types[rowx][colx],
IndexError: list index out of range
and it makes it only on the 11th file (no matter wich file will be the 11th).
Can you tell me what is going on with that thing?