I am trying to search and replace certain words in my .xml file and replace it with another, but I struggle a bit.
I have been using this code so far:
import xml.etree.ElementTree as ET
with open('Rom1.xml', encoding="utf8") as f:
tree = ET.parse(f)
#root = tree.find('ExportedObjects')
root = tree.getroot()
for elem in root.iter():
try:
elem.text = elem.text.replace('Rom1', 'Rom2')
except AttributeError:
pass
Rom1.xml this is a snapshot from the XML file showing the structure
The XML file is pretty big but it contains the string 'Rom1' 41 times and I would like to replace all of them.
I know a simple search and replace in text editor does the job, but I want to automate this since I will do it for several hundered of files.
Any help is appriciated :)
If there is no possibility of ambiguity then you could just do this:
with open('Rom1.xml', encoding='utf-8', mode='r+') as xml:
content = xml.read().replace('Rom1', 'Rom2')
xml.seek(0)
xml.write(content)
xml.truncate()
In this case the truncate() call is not necessary. However, if the second argument to replace() was shorter than the first then this would be crucial. Just leave it there to account for all eventualities
Ok so I tried something else with great success:
import xml.etree.ElementTree as ET
Rom2 = input('Number: ')
input_file = "Rom1.xml"
output_file = Rom2+".xml"
with open(input_file) as f:
xml_content = f.readlines()
with open(output_file, 'w+') as f:
for line in xml_content:
f.write(line.replace('Rom1', Rom2))
But if I want to replace a second string f.ex 'SQ4XXX' to 'SQ4050' then it replaces both and keeps the old as well? I'm confused.
import xml.etree.ElementTree as ET
Rom2 = input('Number: ')
sq = input('SQ: ')
input_file = "Rom1.xml"
output_file = Rom2+".xml"
with open(input_file) as f:
xml_content = f.readlines()
with open(output_file, 'w+') as f:
for line in xml_content:
f.write(line.replace('Rom1', Rom2))
f.write(line.replace('SQ4XXX', sq))
Ok I got it working like I wanted, thanks for the help guys!
Heres the final code:
import xml.etree.ElementTree as ET
Rom2 = input('Number: ')
sq4 = input('SQ4: ')
sq5 = input('SQ5: ')
input_file = "Rom1.xml"
output_file = Rom2+".xml"
with open(input_file) as f:
xml_content = f.readlines()
with open(output_file, 'w+') as f:
for line in xml_content:
f.write(line.replace('Rom1', Rom2))
with open(output_file, encoding='utf-8', mode='r+') as xml:
content = xml.read().replace('SQ4XXX', sq4)
xml.seek(0)
xml.write(content)
xml.truncate()
with open(output_file, encoding='utf-8', mode='r+') as xml:
content = xml.read().replace('SQ5XXX', sq5)
xml.seek(0)
xml.write(content)
xml.truncate()er code here
Related
The code below takes an XML file and parses specific elements into a CSV file. Regarding the code I had simpler and different code that had a slightly different out, the code below is as an outcome of a lot help from here.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
tree = ET.parse('thexmlfile.xml')
root = tree.getroot()
with open('output.csv', 'w', newline='') as f:
writer = csv.writer(f)
start_nodes = root.findall('.//START')
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
for sn in start_nodes:
row = defaultdict(str)
for k,v in sn.attrib.items():
row[k] = v
for rn in sn.findall('.//Rational'):
row['rational'] = rn.text
for qu in sn.findall('.//Qualify'):
row['qualify'] = qu.text
for ds in sn.findall('.//Description'):
row['description_txt'] = ds.text
row['description_num'] = ds.attrib['num']
# all other tags except set data must be parsed before this.
for st in sn.findall('.//SetData'):
for k,v in st.attrib.items():
row['set_data_'+ str(k)] = v
row_data = [row[i] for i in headers]
writer.writerow(row_data)
row = defaultdict(str)
I'm trying to make that this code goes to a folder that has many XML files and parses them into one single CSV file. Simply said instead of parsing one XML file , do this for multiple XMLs and parse them to one csv file.
What I would normally do is use os.listdir(): . The code would look something like this
directory = 'C:/Users/docs/FolderwithXMLs'
for filename in os.listdir(directory):
if filename.endswith(".xml"):
#Something here
df.to_csv("./output.csv")
continue
else:
continue
I have tried different ways to implement this into the code from above without success until now. Considering that this process should also be fast.
Try:
from pathlib import Path
directory = 'C:/Users/docs/FolderwithXMLs'
with open('output.csv', 'w', newline='') as f:
writer = csv.writer(f)
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
xml_files_list = list(map(str,Path(directory).glob('**/*.xml')))
for xml_file in xml_files_list:
tree = ET.parse(xml_file)
root = tree.getroot()
start_nodes = root.findall('.//START')
for sn in start_nodes:
row = defaultdict(str)
# <<<<< Indentation was wrong here
for k,v in sn.attrib.items():
row[k] = v
# Rest of the code here.
Hope that helps.
I would like to write new data to the beginning of my text file, with the previous data shifting down 1 line each time new data is imported, I would like everything to be organized, but every time I import something gets deleted.
Code:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
response = requests.get('https://www.lotteryusa.com/michigan/lucky-4-life/')
soup = BeautifulSoup(response.text, 'html.parser')
date = soup.find(class_='date')
results = soup.find(class_='draw-result list-unstyled list-inline')
d = datetime.strptime(date.time['datetime'], '%Y-%m-%d')
Lucky = (d.strftime("%m%d%Y")+(',')+results.get_text()[:-20].strip().replace('\n',','))
print(Lucky)
with open("webscraper2noteppad++", "r+") as f:
file = f.readlines()
f.seek(0,0)
f.write(Lucky)
Also tried doing this
with open("webscraper2noteppad++", "r+") as f:
file = f.read()
f.seek(0,0)
f.write(Lucky + '\n')
but I have to put 10 lines between the already existing data, and the new data. So it can be can be imported on top without deleting.
You can first read the content of your file, the append it to the new data and then write everything to the file:
with open("webscraper2noteppad++", "r") as f:
data = f.read()
with open("webscraper2noteppad++", "w") as f:
f.write('{}{}{}'.format(lucky, '\n' if data else '', data))
I can change the text in a file line by line, but I don't know how to write the results (changes) into the file.
This is a small part of my file:
<name>2016-09-15_obere-firstalm_gehen-6,5km</name>
<extensions>
<line xmlns="http://www.topografix.com/GPX/gpx_style/0/2">
<color>000000</color>
</line>
</extensions>
<trkseg>
<trkpt lat="47.671193" lon="11.886518">
<ele>1115.6</ele>
<time>2016-09-15T11:57:44Z</time>
</trkpt>
<trkpt lat="47.670686" lon="11.886412">
<ele>1117.6</ele>
<time>2016-09-15T11:58:14Z</time>
</trkpt>
<trkpt lat="47.670821" lon="11.886459">
<ele>1055.6</ele>
<time>2016-09-15T11:58:44Z</time>
</trkpt>
With a Python script I change values of elevations by adding 30.
Example:
Before change:
elevation (ele) 1115.6,
after change:
elevation (ele) 1145.6
#This little Python adds 30 to elevation:
import re
f1 = raw_input("name of your GPX file: ")
f1 = open(f1,'r+')
for line in f1:
res = re.search(r"<(ele)>(.+)</\1>",line)
if res:
number=float(res.group(2))
number_elev=number+30
number_elev=str(number_elev)
ress = re.sub(r"<(ele)>(.+)", r"\2",number_elev)
#print shows correct new values between <ele> and </ele>
print ress + "\n"
###but how to write into the gpx file these changes?
f1.close()
print "OK"
Expected: Write the file with the changed lines.
Actual: I don't know how to write a change by regexp into the file.
Thx in advance for your help.
You'll first need to read all the lines in your file and write each one to the file replacing the ones matching the regex search.
Also since one more <ele>..</ele> tag sequences can be on the same line, you'll need to find all occurrences of those in the line and replace them accordingly.
import re
f1 = raw_input("name of your GPX file: ")
with open(f1,'r') as f:
lines = f.readlines()
with open(f1, 'w') as f:
for line in lines:
ress = line
res = re.findall(r"<(ele)>(.+)</\1>",ress)
if res:
for r in res:
number=float(r[1])
number_elev=number+30
number_elev=str(number_elev)
ress=re.sub(r"<(ele)>{}</(ele)>".format(r[1]), r"<ele>{}</ele>".format(number_elev),string=ress, count=1)
f.write(ress)
Don't try to read and write from/to the same file at the same time. Just create and output file and write to it.
The following code is untested but it should work.
import re
f1 = input("name of your GPX file: ")
input_file = open(f1,'r+')
output_file = open(f1 + '_output', 'w+')
for line in input_file:
res = re.search(r"<(ele)>(.+)</\1>", line)
if res:
number=float(res.group(2))
number_elev=number+30
number_elev=str(number_elev)
line = line.replace(res.group(2), number_elev)
output_file.write(line)
input_file.close()
output_file.close()
print("OK")
You can read the file all at once and apply the regex to the data and write out the modified data to another file as follows:
import re
with open('input-file.xml') as fd:
data = fd.read()
regex = re.compile('(<ele>)([\d.]+)(</ele>)')
while True:
match = regex.search(data)
if not match:
break
new_value = float(match.group(2)) + 30
# <ele>6373.8</ele> becomes </ele>6373.8<ele> so that it doesnt match again
data = regex.sub(r'\g<3>{}\g<1>'.format(new_value), data, count=1)
# undo </ele>...<ele> the tag reversal done inside the while loop
regex = re.compile('(</ele>)([\d.]+)(<ele>)')
data = regex.sub(r'\3\2\1', data)
with open('output-file.xml', 'w') as fd:
fd.write(data)
I'm scraping rss feed from a web site (http://www.gfrvitale.altervista.org/index.php/autismo-in?format=feed&type=rss).
I have wrote down a script to extract and purifie the text from every of the feed. My main problem is to save each text of each item in a different file, I also need to name each file with it's proper title exctractet from the item.
My code is:
for item in myFeed["items"]:
time_structure=item["published_parsed"]
dt = datetime.fromtimestamp(mktime(time_structure))
if dt>t:
link=item["link"]
response= requests.get(link)
doc=Document(response.text)
doc.summary(html_partial=False)
# extracting text
h = html2text.HTML2Text()
# converting
h.ignore_links = True #ignoro i link
h.skip_internal_links=True #ignoro i link esterni
h.inline_links=True
h.ignore_images=True #ignoro i link alle immagini
h.ignore_emphasis=True
h.ignore_anchors=True
h.ignore_tables=True
testo= h.handle(doc.summary()) #testo estratto
s = doc.title()+"."+" "+testo #contenuto da stampare nel file finale
tit=item["title"]
# save each file with it's proper title
with codecs.open("testo_%s", %tit "w", encoding="utf-8") as f:
f.write(s)
f.close()
The error is:
File "<ipython-input-57-cd683dec157f>", line 34 with codecs.open("testo_%s", %tit "w", encoding="utf-8") as f:
^
SyntaxError: invalid syntax
You need to put the comma after %tit
should be:
#save each file with it's proper title
with codecs.open("testo_%s" %tit, "w", encoding="utf-8") as f:
f.write(s)
f.close()
However, if your file name has invalid characters it will return an error (i.e [Errno 22])
You can try this code:
...
tit = item["title"]
tit = tit.replace(' ', '').replace("'", "").replace('?', '') # Not the best way, but it could help for now (will be better to create a list of stop characters)
with codecs.open("testo_%s" %tit, "w", encoding="utf-8") as f:
f.write(s)
f.close()
Other way using nltk:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tit = item["title"]
tit = tokenizer.tokenize(tit)
tit = ''.join(tit)
with codecs.open("testo_%s" %tit, "w", encoding="utf-8") as f:
f.write(s)
f.close()
First off, you misplaced the comma, it should be after the %tit not before.
Secondly, you don't need to close the file because the with statement that you use, does it automatically for you. And where did the codecs came from? I don't see it anywhere else.... anyway, the correct with statement would be:
with open("testo_%s" %tit, "w", encoding="utf-8") as f:
f.write(s)
I have data which is being accessed via http request and is sent back by the server in a comma separated format, I have the following code :
site= 'www.example.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site,headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
soup = soup.get_text()
text=str(soup)
The content of text is as follows:
april,2,5,7
may,3,5,8
june,4,7,3
july,5,6,9
How can I save this data into a CSV file.
I know I can do something along the lines of the following to iterate line by line:
import StringIO
s = StringIO.StringIO(text)
for line in s:
But i'm unsure how to now properly write each line to CSV
EDIT---> Thanks for the feedback as suggested the solution was rather simple and can be seen below.
Solution:
import StringIO
s = StringIO.StringIO(text)
with open('fileName.csv', 'w') as f:
for line in s:
f.write(line)
General way:
##text=List of strings to be written to file
with open('csvfile.csv','wb') as file:
for line in text:
file.write(line)
file.write('\n')
OR
Using CSV writer :
import csv
with open(<path to output_csv>, "wb") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
for line in data:
writer.writerow(line)
OR
Simplest way:
f = open('csvfile.csv','w')
f.write('hi there\n') #Give your csv text here.
## Python will convert \n to os.linesep
f.close()
You could just write to the file as you would write any normal file.
with open('csvfile.csv','wb') as file:
for l in text:
file.write(l)
file.write('\n')
If just in case, it is a list of lists, you could directly use built-in csv module
import csv
with open("csvfile.csv", "wb") as file:
writer = csv.writer(file)
writer.writerows(text)
I would simply write each line to a file, since it's already in a CSV format:
write_file = "output.csv"
with open(write_file, "wt", encoding="utf-8") as output:
for line in text:
output.write(line + '\n')
I can't recall how to write lines with line-breaks at the moment, though :p
Also, you might like to take a look at this answer about write(), writelines(), and '\n'.
To complement the previous answers, I whipped up a quick class to write to CSV files. It makes it easier to manage and close open files and achieve consistency and cleaner code if you have to deal with multiple files.
class CSVWriter():
filename = None
fp = None
writer = None
def __init__(self, filename):
self.filename = filename
self.fp = open(self.filename, 'w', encoding='utf8')
self.writer = csv.writer(self.fp, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL, lineterminator='\n')
def close(self):
self.fp.close()
def write(self, elems):
self.writer.writerow(elems)
def size(self):
return os.path.getsize(self.filename)
def fname(self):
return self.filename
Example usage:
mycsv = CSVWriter('/tmp/test.csv')
mycsv.write((12,'green','apples'))
mycsv.write((7,'yellow','bananas'))
mycsv.close()
print("Written %d bytes to %s" % (mycsv.size(), mycsv.fname()))
Have fun
What about this:
with open("your_csv_file.csv", "w") as f:
f.write("\n".join(text))
str.join() Return a string which is the concatenation of the strings in iterable.
The separator between elements is
the string providing this method.
In my situation...
with open('UPRN.csv', 'w', newline='') as out_file:
writer = csv.writer(out_file)
writer.writerow(('Name', 'UPRN','ADMIN_AREA','TOWN','STREET','NAME_NUMBER'))
writer.writerows(lines)
you need to include the newline option in the open attribute and it will work
https://www.programiz.com/python-programming/writing-csv-files