Converting all XML files to JSON files in a folder in Python?

I have a folder with thousands of XML files. I would like to read each XML file and create a JSON file for it, without deleting the XML file.
For example: if I have a file named abc.xml, I want it converted to JSON and stored as abc.json, so the folder contains both abc.xml and abc.json, and the same for all the other files.
Currently I am using the code below to convert the XML to JSON, but the problem is that it does not create a new JSON file.
import os
import json
import xmltodict

for filename in os.listdir(path):
    if not filename.endswith('.xml'): continue
    fullname = os.path.join(path, filename)
    with open(fullname, 'r') as f:
        xmlString = f.read()
    jsonString = json.dumps(xmltodict.parse(xmlString), indent=4)
    with open(fullname, 'w') as x:
        x.write(jsonString)
Any relevant suggestions would be really appreciated.

fullname is the name of your input XML file. You want to change it to the same name but with a .json extension; otherwise you'll create a file with an .xml extension but JSON data inside, destroying your original XML file (not that I care about XML much, but...).
with open(fullname, 'r') as f:
    xmlString = f.read()
json_output = os.path.splitext(fullname)[0] + ".json"
Now write your file directly (no need to create a string first):
with open(json_output, 'w') as f:
    json.dump(xmltodict.parse(xmlString), f, indent=4)
Note that you could use glob.glob directly like this, so you get the full, .xml-filtered path:
for fullname in glob.glob(os.path.join(path, "*.xml")):
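Putting the pieces together, a minimal sketch of the whole loop (assuming, as in the question, that path is defined and xmltodict is installed):
import glob
import json
import os

import xmltodict

for fullname in glob.glob(os.path.join(path, "*.xml")):
    with open(fullname, 'r') as f:
        xmlString = f.read()
    # Derive abc.json from abc.xml, leaving the original XML file untouched
    json_output = os.path.splitext(fullname)[0] + ".json"
    with open(json_output, 'w') as f:
        json.dump(xmltodict.parse(xmlString), f, indent=4)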

This should work.
import os
import json
import xmltodict

for filename in os.listdir(path):
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)
    with open(fullname, 'r') as f:
        xmlString = f.read()
    jsonString = json.dumps(xmltodict.parse(xmlString), indent=4)
    with open(fullname[:-4] + ".json", 'w') as f:
        f.write(jsonString)

Related

Getting "Xref table not zero-indexed. ID numbers for objects will be corrected" warning

I have the following code (the comments explain what is occurring):
import os
from PyPDF2 import PdfFileReader

# Path to the directory containing the PDF files
pdf_dir = '/path/to/pdf/files'

# Iterate over the files in the directory
for filename in os.listdir(pdf_dir):
    # Check if the file is a PDF file
    if filename.endswith('.pdf'):
        # Construct the full path to the file
        filepath = os.path.join(pdf_dir, filename)
        # Open the PDF file and read its contents
        with open(filepath, 'rb') as f:
            pdf = PdfFileReader(f)
            # Extract the text from the PDF file
            text = ''
            for page in pdf.pages:
                text += page.extractText()
        # Construct the name of the output text file
        txt_filename = filename[:-4] + '.txt'
        # Write the text to the output file
        with open(txt_filename, 'w') as f:
            f.write(text)
When I run the code, it produces an "Xref table not zero-indexed. ID numbers for objects will be corrected" warning. It is not a hard error, but it makes me wonder whether there's a different way I should be doing this.
Thanks for any suggestions.
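The warning itself is harmless: it means the parser found a cross-reference table that doesn't start at object 0 and repaired the object IDs while reading; the extracted text is unaffected. If the deprecated PdfFileReader/extractText names are a concern, the library's successor pypdf exposes PdfReader and extract_text(); a minimal sketch, assuming pypdf is installed:
# Sketch using the maintained pypdf package (assumption: pip install pypdf);
# PdfReader and extract_text() replace PdfFileReader and extractText()
import os
from pypdf import PdfReader

pdf_dir = '/path/to/pdf/files'

for filename in os.listdir(pdf_dir):
    if filename.endswith('.pdf'):
        reader = PdfReader(os.path.join(pdf_dir, filename))
        # Concatenate the text of every page into one string
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        with open(filename[:-4] + '.txt', 'w') as f:
            f.write(text)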

Store, manipulate and retrieve content of docx files, retaining formatting

So I need a way to retrieve the content of docx files (text, images, formatting), store it, and then generate a new docx with the content of some of the files stitched together.
My current approach is to extract the <body> from the underlying document.xml, store that in a Pandas DataFrame, and modify the content of a template docx with data from that DataFrame before generating a new docx.
Storing the body of the files in a Pandas DataFrame seems easy enough:
import os
import re
import zipfile

import pandas as pd
from lxml import etree

def get_word_xml(docx_filename):
    with open(docx_filename, 'rb') as f:
        zip = zipfile.ZipFile(f)
        xml_content = zip.read('word/document.xml')
    return xml_content

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

df = pd.DataFrame(columns=['Name', 'Text'])
for root, dirs, files in os.walk("./docs", topdown=False):
    for name in files:
        wordxml = get_word_xml(os.path.join(root, name).replace("\\", "/"))
        wordxml = get_xml_tree(wordxml)
        wordxml = etree.tostring(wordxml, pretty_print=True)
        body = re.search("(?<=<w:body>)(.*)(?=</w:body>)", str(wordxml)).group(1)
        df = df.append({'Name': name.split('.')[0], 'Text': body}, ignore_index=True)
The actual problem I'm facing, however, is that generating a docx file leads to a corrupted file. I tried opening a file, extracting the contents (not even manipulating the data at this point) and generating a new file with the same content (basically a copy):
with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)
    etree.fromstring(xml_content)
    with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
        xmlstr = str(xml_content)
        f.write(str(xmlstr))
    filenames = zip.namelist()

zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)
shutil.rmtree(tmp_dir)
I'm not even sure if this is the right approach for this task, but I used this as a reference.
There are several problems with your code:
etree.fromstring(xml_content)
This doesn't assign the XML Element created from xml_content to anything.
xmlstr = str(xml_content)
f.write(str(xmlstr))
First, you have an extra str conversion. Second, the correct way to convert the XML back to a string is via the etree tostring() method.
Try the following code; on my (Linux) system, the generated output.docx opens in LibreOffice Writer without problems. (BTW, next time please include complete code, including imports.)
#! /usr/bin/python3

import zipfile
import tempfile
import xml.etree.ElementTree as etree
import os.path
import shutil

with open('Test.docx', 'rb') as f:
    zip = zipfile.ZipFile(f)
    xml_content = zip.read('word/document.xml')
    tmp_dir = tempfile.mkdtemp()
    zip.extractall(tmp_dir)
    xml = etree.fromstring(xml_content)
    with open(os.path.join(tmp_dir, 'word/document.xml'), 'w') as f:
        xmlstr = etree.tostring(xml, encoding="unicode", xml_declaration=True)
        f.write(xmlstr)
    filenames = zip.namelist()

zip_copy_filename = 'output.docx'
with zipfile.ZipFile(zip_copy_filename, "w") as docx:
    for filename in filenames:
        docx.write(os.path.join(tmp_dir, filename), filename)
shutil.rmtree(tmp_dir)
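As an aside, the temporary directory can be avoided by copying the archive member by member in memory. A minimal sketch of that variant (same caveats as above; document.xml is just round-tripped through ElementTree, which is where any manipulation would go):
#! /usr/bin/python3

import zipfile
import xml.etree.ElementTree as etree

with zipfile.ZipFile('Test.docx') as src, zipfile.ZipFile('output.docx', 'w') as dst:
    for item in src.infolist():
        data = src.read(item.filename)
        if item.filename == 'word/document.xml':
            # Round-trip the document body; manipulate the tree here if needed
            xml = etree.fromstring(data)
            data = etree.tostring(xml, encoding='utf-8', xml_declaration=True)
        dst.writestr(item, data)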

How to gzip files using writelines?

I have a few files in my tmp folder that I want to gzip individually and upload to S3. The testList contains paths like /tmp/files/File1. So fileName2, which I use for gzip.open(), is /tmp/files/File1.gz. I want to gzip each file in the testList.
for i in testList:
    fileName = i.replace("/tmp/files/", "")
    fileName2 = i + '.gz'
    with open("path/to/file", 'rb') as orig_file:
        with gzip.open(fileName2, 'wb') as zipped_file:
            zipped_file.writelines(orig_file)
            bucket.upload_fileobj(zipped_file, fileName, ExtraArgs={'ContentType': "application/gzip"})
When I download the files from S3, they have a .gz file type but I am unable to open them locally; it throws an error that the .gz file is empty and cannot be expanded. I believe the way I am writing the content is incorrect.
How can I fix this?
Edit:
for i in testList:
    fileName = i.replace("/tmp/files/", "")
    fileName2 = i + '.gz'
    with open(i, 'rb') as f_in:
        with gzip.open(fileName2, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
            f_out.upload_fileobj(zipped_file, fileName, ExtraArgs={'ContentType': "application/gzip"})
Even with this, the gzip files are still not expandable.
You are getting an open file object in orig_file, not just its lines.
I think your use case is about turning an existing file into a compressed one, so the following should be the relevant paragraph from the "Examples of usage" section of the documentation:
Example of how to GZIP compress an existing file:
import gzip
import shutil

with open('/home/joe/file.txt', 'rb') as f_in:
    with gzip.open('/home/joe/file.txt.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
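Applied to the loop from the question (keeping the bucket, testList, fileName, and fileName2 names from the original), the key point is to upload only after the with-blocks have closed the gzip file, so the compressed data is fully flushed to disk; a sketch:
import gzip
import shutil

for i in testList:
    fileName = i.replace("/tmp/files/", "")
    fileName2 = i + '.gz'
    with open(i, 'rb') as f_in:
        with gzip.open(fileName2, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    # The .gz file is now closed and flushed, so it is safe to upload by path
    bucket.upload_file(fileName2, fileName, ExtraArgs={'ContentType': "application/gzip"})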

How to replace a string in every file in a directory

I'm trying to use Python to replace the string "XXXXXXXXXXX" with the new string "ILoveStackOverflow" in every file in a particular folder (all the files in the folder are XML).
My code is as follows:
import os

for filename in os.listdir("C:/Users/Francesco.Borri/Desktop/passivo GME"):
    if filename.endswith('.xml'):
        with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename)) as f:
            content = f.read()
        content = content.replace("XXXXXXXXXXX", "ILoveStackOverflow")
        with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename), mode="w") as f:  # Long Pierre-André answer
            f.write(content)
The next step would be to replace a different string, "YYYY", with a number that increases each time: if there are 10 files in my directory and I set the starting number to 1, "YYYY" in the first file will be replaced with 1, in the second file with 2, and so on up to 10.
You are close. When you open the file the second time, you have to open it in write mode to be able to write the content.
with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename), 'w') as f:
    f.write(content)
Once you fix this, I think the second part of your question is just a matter of maintaining a variable whose value you increment every time you replace the string. You could do it manually (iterate over the string), or use the replace function in a for loop:
with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename)) as f:
    content = f.read()
for i in range(content.count("YYYY")):
    # str.replace returns a new string, so the result must be reassigned
    content = content.replace("YYYY", str(i), 1)  # or str(i + 1)
with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename), 'w') as f:
    f.write(content)
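Note that the loop above numbers the occurrences within a single file. If instead the counter should increase per file, as the question describes, a minimal sketch (assuming each file gets the same replacements):
import os

folder = "C:/Users/Francesco.Borri/Desktop/passivo GME"
xml_files = sorted(f for f in os.listdir(folder) if f.endswith('.xml'))

# Number the files 1..N and replace "YYYY" in file k with k
for number, filename in enumerate(xml_files, start=1):
    path = os.path.join(folder, filename)
    with open(path) as f:
        content = f.read()
    content = content.replace("XXXXXXXXXXX", "ILoveStackOverflow")
    content = content.replace("YYYY", str(number))
    with open(path, 'w') as f:
        f.write(content)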
with open(os.path.join("C:/Users/Francesco.Borri/Desktop/passivo GME", filename), mode="w") as f:
You must open the file in write mode.

IOError when downloading and decompressing gzip file

I'm trying to download and decompress a gzip file, and then convert the resulting decompressed file, which is in TSV format, into CSV, which would be easier to parse. I am trying to gather the data from the "Download Table" link in this URL. My code is as follows, using the same idea as in this post, but I get the error IOError: [Errno 2] No such file or directory: 'file=data/irt_euryld_d.tsv' on the line with open(outFilePath, 'w') as outfile:
import os
import urllib2
import gzip
import StringIO

baseURL = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?"
filename = "D:\Sidney\irt_euryld_d.tsv.gz"  # Edited after heinst's comment below
outFilePath = filename[:-3]

response = urllib2.urlopen(baseURL + filename)
compressedFile = StringIO.StringIO()
compressedFile.write(response.read())
compressedFile.seek(0)
decompressedFile = gzip.GzipFile(fileobj=compressedFile, mode='rb')

with open(outFilePath, 'w') as outfile:
    outfile.write(decompressedFile.read())

# Now have to deal with the tsv file
import csv
with open(outFilePath, 'rb') as tsvin, open('ECB.csv', 'wb') as csvout:
    tsvin = csv.reader(tsvin, delimiter='\t')
    csvout = csv.writer(csvout)  # Converting output into CSV format
Thank You
The path you were setting filename to was not a valid path to have a file written to it, so you have to change filename = "data/irt_euryld_d.tsv.gz" to a valid path to wherever you want the irt_euryld_d.tsv.gz file to live. For example, if I wanted the irt_euryld_d.tsv.gz file on my desktop, I would set filename = "/Users/heinst/Desktop/data/irt_euryld_d.tsv.gz". Since this is a valid path, Python will no longer give you the No such file or directory error.
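For reference, a minimal Python 3 sketch of the same download-decompress-convert pipeline (the exact Eurostat query string is an assumption here; substitute the real download URL from the page):
import csv
import gzip
import shutil
import urllib.request

# Assumed URL: the base URL from the question plus a file query parameter
url = ("http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/"
       "BulkDownloadListing?file=data/irt_euryld_d.tsv.gz")
gz_path = "irt_euryld_d.tsv.gz"
tsv_path = gz_path[:-3]

urllib.request.urlretrieve(url, gz_path)  # download the compressed file

with gzip.open(gz_path, 'rb') as f_in, open(tsv_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)  # decompress to disk

with open(tsv_path, newline='') as tsvin, open('ECB.csv', 'w', newline='') as csvout:
    writer = csv.writer(csvout)
    for row in csv.reader(tsvin, delimiter='\t'):
        writer.writerow(row)  # re-emit each row with comma delimiters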
