Parse many XML files to one CSV file - python

The code below takes an XML file and parses specific elements into a CSV file. I originally had simpler code that produced slightly different output; the version below is the outcome of a lot of help from here.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv

tree = ET.parse('thexmlfile.xml')
root = tree.getroot()

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    start_nodes = root.findall('.//START')
    headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)
    for sn in start_nodes:
        row = defaultdict(str)
        for k, v in sn.attrib.items():
            row[k] = v
        for rn in sn.findall('.//Rational'):
            row['rational'] = rn.text
        for qu in sn.findall('.//Qualify'):
            row['qualify'] = qu.text
        for ds in sn.findall('.//Description'):
            row['description_txt'] = ds.text
            row['description_num'] = ds.attrib['num']
        # all other tags except SetData must be parsed before this
        for st in sn.findall('.//SetData'):
            for k, v in st.attrib.items():
                row['set_data_' + str(k)] = v
        row_data = [row[i] for i in headers]
        writer.writerow(row_data)
        row = defaultdict(str)
I'm trying to make this code go to a folder that has many XML files and parse them all into one single CSV file. Simply said: instead of parsing one XML file, do this for multiple XMLs and write them to one CSV file.
What I would normally do is use os.listdir(). The code would look something like this:
directory = 'C:/Users/docs/FolderwithXMLs'
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        # Something here
        df.to_csv("./output.csv")
        continue
    else:
        continue
I have tried different ways to work this into the code above, without success so far. The process should also be reasonably fast.

Try:
from pathlib import Path
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv

directory = 'C:/Users/docs/FolderwithXMLs'

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)  # <<<<< indentation was wrong here
            for k, v in sn.attrib.items():
                row[k] = v
            # Rest of the per-node extraction from the question goes here.
Hope that helps.
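Since the question also asks about speed: with many files, the parsing can be fanned out across processes while keeping a single writer. A rough sketch, assuming a helper parse_file() that wraps the per-file extraction above and returns the finished rows for one file (parse_file is hypothetical, not part of the code above):

from concurrent.futures import ProcessPoolExecutor

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # each worker parses one XML file and returns its rows; the main
    # process stays the only writer, so no locking is needed
    # (on Windows, wrap this in an `if __name__ == '__main__':` guard)
    with ProcessPoolExecutor() as pool:
        for rows in pool.map(parse_file, xml_files_list):
            writer.writerows(rows)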

Related

Search and replace strings in XML using python

I am trying to search for certain words in my .xml file and replace them with others, but I'm struggling a bit.
I have been using this code so far:
import xml.etree.ElementTree as ET

with open('Rom1.xml', encoding="utf8") as f:
    tree = ET.parse(f)
    # root = tree.find('ExportedObjects')
    root = tree.getroot()
    for elem in root.iter():
        try:
            elem.text = elem.text.replace('Rom1', 'Rom2')
        except AttributeError:
            pass
(Screenshot omitted: Rom1.xml, a snapshot of the XML file showing its structure.)
The XML file is pretty big, but it contains the string 'Rom1' 41 times and I would like to replace all of them.
I know a simple search and replace in a text editor does the job, but I want to automate this since I will do it for several hundred files.
Any help is appreciated :)
If there is no possibility of ambiguity then you could just do this:
with open('Rom1.xml', encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('Rom1', 'Rom2')
    xml.seek(0)
    xml.write(content)
    xml.truncate()
In this case the truncate() call is not strictly necessary, since the replacement is the same length as the original. However, if the second argument to replace() were shorter than the first, it would be crucial: the rewrite would otherwise leave stale trailing bytes from the longer original at the end of the file. Just leave it there to account for all eventualities.
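A minimal illustration of that edge case (demo.txt is a made-up file, not from the question):

# write 7 characters, then replace them in place with a 4-character string
with open('demo.txt', 'w') as f:
    f.write('Rom1000')

with open('demo.txt', mode='r+') as f:
    content = f.read().replace('Rom1000', 'Rom2')
    f.seek(0)
    f.write(content)
    # without truncate() the file would read 'Rom2000' -- the old
    # tail '000' survives past the shorter rewrite
    f.truncate()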
Ok so I tried something else with great success:
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        f.write(line.replace('Rom1', Rom2))
But if I want to replace a second string as well, for example 'SQ4XXX' with 'SQ4050', then it replaces both but keeps the old lines too? I'm confused.
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
sq = input('SQ: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        f.write(line.replace('Rom1', Rom2))
        f.write(line.replace('SQ4XXX', sq))
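The loop above writes every line twice, once per replace() call, which is why both the old and the new strings end up in the output. Chaining the replacements on a single write avoids that (a sketch over the same variables):

with open(output_file, 'w+') as f:
    for line in xml_content:
        # one write per line, with both substitutions applied
        f.write(line.replace('Rom1', Rom2).replace('SQ4XXX', sq))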
Ok I got it working like I wanted, thanks for the help guys!
Here's the final code:
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
sq4 = input('SQ4: ')
sq5 = input('SQ5: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        f.write(line.replace('Rom1', Rom2))

with open(output_file, encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('SQ4XXX', sq4)
    xml.seek(0)
    xml.write(content)
    xml.truncate()

with open(output_file, encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('SQ5XXX', sq5)
    xml.seek(0)
    xml.write(content)
    xml.truncate()
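The three passes above can also be collapsed into a single read, one chained replace, and one write (a sketch over the same inputs):

with open(input_file, encoding='utf-8') as f:
    content = f.read()

# chain all three substitutions on one string, then write once
content = content.replace('Rom1', Rom2).replace('SQ4XXX', sq4).replace('SQ5XXX', sq5)

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(content)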

Looping through two sets of data to conditionally append to CSV file

I'm writing a script that should: 1) open a CSV file; 2) loop through some eBay results data; 3) write details from this data to the same file if they match a search term and are not already present. It has a few issues:
The headers are not written if not present, so I'm manually adding them. I was previously using DictWriter() with writeheader(), but fieldnames was required, so they were written each time.
The if result_id != existing_result_id: condition doesn't work in a nested loop, so the rows aren't written. The if any(x in result_title.upper() for x in search_terms): check does currently work, but I'm not sure how to combine the two before writing the rows.
import csv
import smtplib
from requests_html import HTMLSession

SEARCH_TERMS = ['one', 'two']

session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')

with open('ebay-results.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # Skip headers
    with open('ebay-results.csv', 'a') as csv_file_updated:
        # field_names = ['Title', 'Link', 'ID']
        csv_writer = csv.writer(csv_file_updated)
        search_terms = [x.upper() for x in SEARCH_TERMS]
        for result in results:
            result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
            result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
            result_id = result_link.split('/')[4].split('?')[0]
            result_info = [result_title, result_link, result_id]
            if any(x in result_title.upper() for x in search_terms):
                for line in csv_reader:
                    existing_result_id = line[2]
                    if result_id != existing_result_id:
                        csv_writer.writerow(result_info)
                        send_email(search_terms, result_link)
There are a few issues with your code:
you create a file handle to read a file and then another file handle to append to the same file; that's dodgy at best. Do you expect the reader to read lines you've appended? What's the purpose?
you exhaust the reader with for line in csv_reader: for every result, which will only work once, since you don't seek to the start of the file before rereading; but why reread the file over and over anyway?
standard Python indentation is 4 spaces deep; you would do well to follow convention, as not doing so just makes your code harder to read, maintain and reuse.
It appears you simply want to write results to the file only for identifiers you haven't written before. Why not do this:
read the results once, keeping the identifiers for a quick lookup
then loop over the results, writing results with new identifiers
This is likely what you were after:
import csv
from requests_html import HTMLSession
SEARCH_TERMS = ['one', 'two']
session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')
with open('ebay-results.csv', 'r+', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # Skip headers
    existing_ids = [rec[2] for rec in csv_reader]
    # field_names = ['Title', 'Link', 'ID']
    csv_writer = csv.writer(csv_file)
    search_terms = [x.upper() for x in SEARCH_TERMS]
    for result in results:
        result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
        result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
        result_id = result_link.split('/')[4].split('?')[0]
        result_info = [result_title, result_link, result_id]
        if any(x in result_title.upper() for x in search_terms):
            if result_id not in existing_ids:
                csv_writer.writerow(result_info)
                # not implemented, but not relevant to the question
                # send_email(search_terms, result_link)
You asked how to deal with the headers; another issue might be that the .csv doesn't exist on the first run. Something like this would solve both:
from pathlib import Path

# create the file if it doesn't exist, and write the header to it
if not Path('ebay-results.csv').is_file():
    with open('ebay-results.csv', 'w') as csv_file:
        csv_file.write('Title,Link,ID\n')

# reopen the file for reading; it now must exist, and has a header
with open('ebay-results.csv', 'r+', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    field_names = next(csv_reader)
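One further tweak worth considering: for larger files, membership tests are much faster against a set than a list, so the existing_ids lookup above could be built as:

# a set gives constant-time membership tests for the dedupe check
existing_ids = {rec[2] for rec in csv_reader}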

Generate XML files based on rows in CSV

I have a CSV and would like to generate an XML file based on each row in the CSV.
Right now it creates an XML file but only with the last row of the CSV. How can I modify this script to generate an XML file for EACH row, and ideally have the filename based on the column "File / Entity Name"? See below for what I currently have. Thanks!
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree

# Topmost XML element
root = Element('root')
number = Element('number')

# Open a file
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    for row in csv.DictReader(f):
        root = Element('gmd:MD_Metadata')
        tree = ElementTree(root)
        for k, v in row.items():
            child = SubElement(root, k)
            child.text = v
        reader = csv.DictReader(f)
    tree.write(open(r'U:\PROJECTS\Technical Graphics\test.xml', 'w'))
    print tostring(root)
You set the value of root inside the loop, so write the file inside the loop as well, once per row:
for row in csv.DictReader(f):
    root = Element('gmd:MD_Metadata')
    tree = ElementTree(root)
    filename = row['File / Entity Name']  # the column you're interested in
    for k, v in row.items():
        child = SubElement(root, k)
        child.text = v
    tree.write(open(r'U:\PROJECTS\Technical Graphics' + '\\' + filename + '.xml', 'w'))
    print tostring(root)
You only want to create the csv.DictReader() class once, rather than for each iteration of your loop.
Similarly, you only want to create your root XML element once.
Finally, the order of the items returned from row.items() is arbitrary and does not reflect the order of the fields in the file.
Try this:
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree

# Topmost XML element
root = Element('root')
number = Element('number')

# Open a file
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    root = Element('gmd:MD_Metadata')
    tree = ElementTree(root)
    reader = csv.DictReader(f)
    for row in reader:
        xml_row = SubElement(root, "row")
        for k in reader.fieldnames:
            child = SubElement(xml_row, k)
            child.text = row[k]
    tree.write(open(r'U:\PROJECTS\Technical Graphics\test.xml', 'w'))
    print tostring(root)
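That writes everything to one document; the original ask was one XML file per CSV row, named from the "File / Entity Name" column. A sketch of that variant (the column name comes from the question; tag names or filenames containing spaces and slashes would still need sanitizing):

import csv
from xml.etree.ElementTree import Element, SubElement, ElementTree

with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        root = Element('gmd:MD_Metadata')
        for k in reader.fieldnames:
            child = SubElement(root, k)
            child.text = row[k]
        # one output file per row, named after the row's entity name
        filename = row['File / Entity Name']
        ElementTree(root).write(r'U:\PROJECTS\Technical Graphics' + '\\' + filename + '.xml')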

extracting data from CSV file using a reference

I have a CSV file with several hundred organism IDs, and a second CSV file with several thousand organism IDs and additional characteristics (taxonomic information, abundances per sample, etc.).
I am trying to write code that will extract the information from the larger CSV using the smaller CSV file as a reference. Meaning, it will look at both files, and if an ID is in both, it will extract all the information for that ID from the larger file and write it to a new file (basically writing the entire row for that ID).
So far I have written the following, and while the code does not error out on me, I get a blank file in the end and I don't exactly know why. I am a graduate student who knows some simple coding, but I'm still very much a novice.
Thank you
import sys
import csv
import os.path

SparCCnames = open(sys.argv[1], "rU")
OTU_table = open(sys.argv[2], "rU")
new_file = open(sys.argv[3], "w")

Sparcc_OTUs = csv.writer(new_file)

d = csv.DictReader(SparCCnames)
ids = csv.DictReader(OTU_table)

for record in ids:
    idstopull = record["OTUid"]
    if idstopull[0] == "OTUid":
        continue
    if idstopull[0] in d:
        new_id.writerow[idstopull[0]]

SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code, but you can try this:
import csv

def csv_to_dict(csv_file_path):
    csv_file = open(csv_file_path, 'rb')
    csv_file.seek(0)
    sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
    csv_file.seek(0)
    dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
    csv_file.seek(0)
    dict_data = []
    for record in dict_reader:
        dict_data.append(record)
    csv_file.close()
    return dict_data

def dict_to_csv(csv_file_path, dict_data):
    csv_file = open(csv_file_path, 'wb')
    writer = csv.writer(csv_file, dialect='excel')
    headers = dict_data[0].keys()
    writer.writerow(headers)
    # headers must match dat.keys()
    for dat in dict_data:
        line = []
        for field in headers:
            line.append(dat[field])
        writer.writerow(line)
    csv_file.close()

if __name__ == "__main__":
    big_csv = csv_to_dict('/path/to/big_csv_file.csv')
    small_csv = csv_to_dict('/path/to/small_csv_file.csv')
    output = []
    for s in small_csv:
        for b in big_csv:
            if s['id'] == b['id']:
                output.append(b)
    if output:
        dict_to_csv('/path/to/output.csv', output)
    else:
        print "Nothing."
Hope that will help.
You need to read the data into a data structure; assuming OTUid is unique, you can store it in a dictionary for fast lookup:
with open(sys.argv[1], "rU") as SparCCnames:
    d = csv.DictReader(SparCCnames)
    fieldnames = d.fieldnames
    data = {i['OTUid']: i for i in d}

with open(sys.argv[2], "rU") as OTU_table, open(sys.argv[3], "w") as new_file:
    Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
    ids = csv.DictReader(OTU_table)
    for record in ids:
        if record['OTUid'] in data:
            Sparcc_OTUs.writerow(data[record['OTUid']])
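If the output file should start with the same header row, a writeheader() call right after constructing the DictWriter does it:

Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
Sparcc_OTUs.writeheader()  # emit the header row before any data rows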
Thank you everyone for your help. I played with things, consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv

input_file = csv.DictReader(open(sys.argv[1], "rU"))   # has all the info
ref_list = csv.DictReader(open(sys.argv[2], "rU"))     # reference list
output_file = csv.DictWriter(
    open(sys.argv[3], "w"), input_file.fieldnames)     # output file with headers
output_file.writeheader()                              # write headers to the output file

white_list = {}                              # create an empty dictionary
for record in ref_list:                      # for every line in the reference list
    white_list[record["Sample_ID"]] = None   # store the IDs as dictionary keys

for record in input_file:                    # for every line in the input file
    record_id = record["Sample_ID"]          # store the ID in record_id
    if record_id in white_list:              # if the ID is in the reference list
        output_file.writerow(record)         # write the entire row to the new file
    else:                                    # if it is not in the reference list
        continue                             # ignore it and keep iterating
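As a side note, a set is the more idiomatic structure for this kind of whitelist than a dictionary of None values (a small variant of the lookup above):

# build a set of the IDs to keep; the membership test reads the same way
white_list = {record["Sample_ID"] for record in ref_list}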

How to convert CSV file to multiline JSON?

Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names; the reader uses the CSV module to read the file and the field names to dump the file to JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader(line, fieldnames), which loops through each line, but it does the entire file on one line, then the entire file again on the next line, and continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on its own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"
The problem with your desired output is that it is not a valid JSON document; it's a stream of JSON documents!
That's okay if it's what you need, but it means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json

csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')

fieldnames = ("FirstName", "LastName", "IDNumber", "Message")
reader = csv.DictReader(csvfile, fieldnames)
for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')
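The result is the JSON Lines layout the question asks for. Reading it back is one json.loads call per line, for example:

import json

# each line of file.json holds one complete JSON document
with open('file.json') as f:
    records = [json.loads(line) for line in f]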
You can use a Pandas DataFrame to achieve this, as in the following example:
import pandas as pd

csv_file = pd.read_csv("path/to/file.csv", sep=",", header=0, index_col=False)
csv_file.to_json("/path/to/new/file.json", orient="records", date_format="epoch", double_precision=10, force_ascii=True, date_unit="ms", default_handler=None)
import csv
import json

file = 'csv_file_name.csv'
json_file = 'output_file_name.json'

# Read the CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert the CSV data into JSON
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))  # pretty
        # f.write(json.dumps(data))  # or compact -- but write one or the other,
        # not both, or the file won't be valid JSON

read_CSV(file, json_file)
See the documentation of json.dumps().
I took @SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys

for row in csv.DictReader(sys.stdin):
    json.dump(row, sys.stdout)
    sys.stdout.write('\n')
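Saved as, say, csv2jsonl.py (a made-up name), it reads CSV on stdin and writes one JSON document per line on stdout:

python csv2jsonl.py < file.csv > file.json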
You can try this:
import csvmapper

# how the object should look
mapper = csvmapper.DictMapper([
    [
        {'name': 'FirstName'},
        {'name': 'LastName'},
        {'name': 'IDNumber', 'type': 'int'},
        {'name': 'Messages'}
    ]
])

# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)

print converter.doConvert(pretty=True)
Edit:
A simpler approach:
import csvmapper

fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = csvmapper.CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)

print converter.doConvert(pretty=True)
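(Note: csvmapper is a third-party package, not part of the standard library, so it has to be installed separately.)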
I see this is old, but I needed the code from SingleNegationElimination. However, I had an issue with the data containing non-UTF-8 characters. These appeared in fields I was not overly concerned with, so I chose to ignore them, though that took some effort. I am new to Python, so with some trial and error I got it to work. The code is a copy of SingleNegationElimination's with extra handling of UTF-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The code below worked.
import csv, json

csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')

fieldnames = ("Scope","Comment","OOS Code","In RMF","Code","Status","Name","Sub Code","CAT","LOB","Description","Owner","Manager","Platform Owner")
reader = csv.DictReader(csvfile, fieldnames)

code = ''
for row in reader:
    try:
        print('+' + row['Code'])
        for key in row:
            row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
        json.dump(row, jsonfile)
        jsonfile.write('\n')
    except:
        print('-' + row['Code'])
        raise
Add the indent parameter to json.dumps:
data = {'this': ['has', 'some', 'things'],
        'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)
Use pandas and the json library:
import pandas as pd
import json

filepath = "inputfile.csv"
output_path = "outputfile.json"

df = pd.read_csv(filepath)

# Create a multiline json
json_list = json.loads(df.to_json(orient="records"))

with open(output_path, 'w') as f:
    for item in json_list:
        # json.dumps rather than str(item): a dict's repr uses single
        # quotes, which would not be valid JSON
        f.write(json.dumps(item) + "\n")
How about using Pandas to read the CSV file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values), and finally converting the DataFrame back to JSON (pd.DataFrame.to_json)?
Note: I haven't checked how efficient this will be, but this is definitely one of the easiest ways to manipulate and convert a large CSV to JSON.
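A sketch of that approach (the paths and the dropped column are placeholders, not from the original answer); orient="records" with lines=True writes one JSON document per line, which is the layout the question asks for:

import pandas as pd

df = pd.read_csv("inputfile.csv")
df = df.drop(columns=["SomeColumn"], errors="ignore")  # hypothetical column to drop
df.to_json("outputfile.json", orient="records", lines=True)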
As a slight improvement to @MONTYHS's answer, iterating through a tuple of field names:
import csv
import json

csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'

csvfile = open(csvfilename, 'r')
jsonfile = open(jsonfilename, 'w')
reader = csv.DictReader(csvfile)

fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')

output = []
for each in reader:
    row = {}
    for field in fieldnames:
        row[field] = each[field]
    output.append(row)

json.dump(output, jsonfile, indent=2, sort_keys=True)
import csv
import json

def read():
    noOfElem = 200  # number of records you want to import
    csv_file_name = "hashtag_donaldtrump.csv"  # csv file name
    json_file_name = "hashtag_donaldtrump.json"  # json file name
    with open(csv_file_name, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with open(json_file_name, 'w') as json_file:
            i = 0
            json_file.write("[")
            for row in csv_reader:
                i = i + 1
                if i == noOfElem:
                    json_file.write("]")
                    return
                json_file.write(json.dumps(row))
                if i != noOfElem - 1:
                    json_file.write(",")
            # note: this assumes the CSV has at least noOfElem rows;
            # otherwise the closing "]" is never written

read()

Change the above three parameters and everything will be done.
import csv
import json

csvfile = csv.DictReader(open('filename.csv', 'r'))  # DictReader takes a file object, not a filename

output = []
for each in csvfile:
    row = {}
    row['FirstName'] = each['FirstName']
    row['LastName'] = each['LastName']
    row['IDNumber'] = each['IDNumber']
    row['Message'] = each['Message']
    output.append(row)

json.dump(output, open('filename.json', 'w'), indent=4, sort_keys=False)
