Need a bash script or python script to find and replace text between two tags?
E.g:
<start>text to find and replace with the one I give as input<end>
' text to find and replace with the one I give as input' is just an example and it could vary every time.
I want to do something like ./changetxt inputfile.xxx newtext
where changetxt has the script;
inputfile.xxx has the text that needs a change and newtext is what goes into inputfile.xxx
python:
import sys
if __name__ == "__main__":
#ajust these to your need
starttag = "<foo>"
endtag = "</foo>"
inputfilename = sys.argv[1]
outputfilename = inputfilename + ".out"
replacestr = sys.argv[2]
#open the inputfile from the first argument
inputfile = open(inputfilename, 'r')
#open an outputfile to put the result in
outputfile = open(outputfilename, 'w')
#test every line in the file for the starttag
for line in inputfile:
if starttag in line and endtag in line:
#compose a new line with the replaced string
newline = line[:line.find(starttag) + len(starttag)] + replacestr + line[line.find(endtag):]
#and write the new line to the outputfile
outputfile.write(newline)
else:
outputfile.write(line)
outputfile.close()
inputfile.close()
Save this in a replacetext.py file and run as python replacetext.py \path\to\inputfile "I want this text between the tags"
You could also use BeautifulSoup for this, from their docs:
If you set a tag’s .string attribute, the tag’s contents are replaced
with the string you give:
markup = 'I linked to <i>example.com</i>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.string = "New link text."
tag
# New link text.
Related
Here below is my code about how to edit text file.
Since python can't just edit a line and save it at the same time,
I save the previous text file's content into a list first then write it out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os
#search all text files which are in the same folder with python script
path = os.path.dirname(os.path.abspath(__file__))
txtlist = glob.glob(path + '\*.txt')
for file in txtlist:
fp1 = open(file, 'r+')
strings = [] #create a list to store the content
for line in fp1:
if 'apple' in line:
strings.append('banana\n') #change the content and store into list
else:
strings.append(line) #store the contents did not be changed
fp2 = open (file, 'w+') # rewrite the original text files
for line in strings:
fp2.write(line)
fp1.close()
fp2.close()
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is : Is there any method can do the same thing?
Like using the other functions or using the other data type rather than list.
Thank you everyone.
Simplify it to this:
with open(fname) as f:
content = f.readlines()
content = ['banana' if line.find('apple') != -1 else line for line in content]
and then write value of content to file back.
Instead of putting all the lines in a list and writing it, you can read it into memory, replace, and write it using same file.
def replace_word(filename):
with open(filename, 'r') as file:
data = file.read()
data = data.replace('word1', 'word2')
with open(filename, 'w') as file:
file.write(data)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
for line in files:
if 'apple' in line:
print('banana')
else:
print(line, end='')
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path, replace_table: dict) -> None:
list_lines = []
need_rewrite = False
with open(file_path, 'r') as f:
for line in f:
flag_rewrite = False
for key, new_val in replace_table.items():
if key in line:
list_lines.append(new_val+'\n')
flag_rewrite = True
need_rewrite = True
break # only replace first find the words.
if not flag_rewrite:
list_lines.append(line)
if not need_rewrite:
return
with open(file_path, 'w') as f:
[f.write(line) for line in list_lines]
if __name__ == '__main__':
work_dir = os.path.dirname(os.path.abspath(__file__))
txt_list = glob.glob(work_dir + '/*.txt')
replace_dict = dict(apple='banana', orange='grape')
for txt_path in txt_list:
replace_line(txt_path, replace_dict)
I am trying to extract complete sentences from a long text file and adding them as strings to a list in Python 2.7. I want to automate this and not just cut and paste in the list.
Here is what I have:
from sys import argv
script, filename = argv # script = alien.py; filename = roswell.txt
listed = []
text = open(filename, 'rw')
for i in text:
lines = readline(i)
listed.append(lines)
print listed
text.close()
Nothing loads to the list.
You can do it with a while loop:
listed = []
with open(filename,"r") as text:
Line = text.readline()
while Line!='':
listed.append(Line)
Line = text.readline()
print listed
In the previous example, I assumed that each sentence is written on a different line, if that's not the case, use this code instead:
listed = []
with open(filename,"r") as text:
Line = text.readline()
while Line!='':
Line1 = Line.split(".")
for Sentence in Line1:
listed.append(Sentence)
Line = text.readline()
print listed
And on a side note, try using with open(...) as text: instead of text = open(...)
Normally sentences are separated by '. ', not '\n'. Under this condition, use split with period+space(without return-enter):
listed = []
fd = open(filename,"r")
try:
data = fd.read()
sentences = data.split(". ")
for sentence in sentences:
listed.append(sentence)
print listed
finally:
fd.close()
I have a script to clean urls to get base domains from example.com/example1 and example.com/example2 down to example.com My issue is when it goes to through the file of urls it will have duplicate base domains. I want to remove the duplicates while printing the urls to a file. below is the code I currently have.
enter from Tkinter import *
import tkFileDialog
import re
def main():
fileOpen = Tk()
fileOpen.withdraw() #hiding tkinter window
file_path = tkFileDialog.askopenfilename(
title="Open file", filetypes=[("txt file",".txt")])
if file_path != "":
print "you chose file with path:", file_path
else:
print "you didn't open anything!"
fin = open(file_path)
fout = open("URL Cleaned.txt", "wt")
for line in fin.readlines():
editor = (line.replace('[.]', '.')
.replace('[dot]', '.')
.replace('hxxp://www.', '')
.replace('hxxps://www.', '')
.replace('hxxps://', '')
.replace('hxxp://', '')
.replace('www.', '')
.replace('http://www.', '')
.replace('https://www.', '')
.replace('https://', '')
.replace('http://', ''))
editor = re.sub(r'/.*', '', editor)
if __name__ == '__main__':
main()
Any help is appreciated. I have scoured the posts and tried all of the suggestions for my issue and have not found one that works.
You can use regular expresion to find the base domains.
If you have one url per line in your file:
import re
def main():
file = open("url.txt",'r')
domains = set()
# will works for any web like https://www.domain.com/something/somethingmore... , also without www, without https or just for www.domain.org
matcher= re.compile("(h..ps?://)?(?P<domain>(www\.)?[^/]*)/?.*")
for line in file:
# make here any replace you need with obfuscated urls like: line = line.replace('[.]','.')
if line[-1] == '\n': # remove "\n" from end of line if present
line = line[0:-1]
match = matcher.search(line)
if match != None: # If a url has been found
domains.add(match.group('domain'))
print domains
file.close()
main()
For example, with this file, it will print:
set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])
perhaps you could use a regular expression:
import re
p = re.compile(r".*\.com/(.*)") # to get for instance 'example1' or 'example2' etc.
with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
lines = fin.readlines():
bases = set(re.search(p, line).groups()[0] for line in lines if len(line) > 1)
for b in bases:
fout.write(b)
Using with open(..) auto closes the files after the executing the block of code
Output:
Using a text file with:
www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3
www.example.com/example4
www.example.com/example4 # as are duplicates
as the lines, I got the output,
example1
example2
example3
example4
I have to use the command prompt and python to recieve an input in the form of a csv file, then read it and convert it into a xml file with the same name as the csv file except with .xml file extension or the user can set the ouput file name and path using the -o --output optional command line argument. Well i have searched on google for days, and so far my program allows me to input command line arguments and i can convert the csv to an xml file but it doesn't print it using the same name as the csv file or when the user sets the name. Instead it just prints out a blank file. Here is my code:
import sys, argparse
import csv
import indent
from xml.etree.ElementTree import ElementTree, Element, SubElement, Comment, tostring
parser=argparse.ArgumentParser(description='Convert wordlist text files to various formats.', prog='Text Converter')
parser.add_argument('-v','--verbose',action='store_true',dest='verbose',help='Increases messages being printed to stdout')
parser.add_argument('-c','--csv',action='store_true',dest='readcsv',help='Reads CSV file and converts to XML file with same name')
parser.add_argument('-x','--xml',action='store_true',dest='toxml',help='Convert CSV to XML with different name')
parser.add_argument('-i','--inputfile',type=argparse.FileType('r'),dest='inputfile',help='Name of file to be imported',required=True)
parser.add_argument('-o','--outputfile',type=argparse.FileType('w'),dest='outputfile',help='Output file name')
args = parser.parse_args()
def main(argv):
reader = read_csv()
if args.verbose:
print ('Verbose Selected')
if args.toxml:
if args.verbose:
print ('Convert to XML Selected')
generate_xml(reader)
if args.readcsv:
if args.verbose:
print ('Reading CSV file')
read_csv()
if not (args.toxml or args.readcsv):
parser.error('No action requested')
return 1
def read_csv():
with open ('1250_12.csv', 'r') as data:
return list(csv.reader(data))
def generate_xml(reader):
root = Element('Solution')
root.set('version','1.0')
tree = ElementTree(root)
head = SubElement(root, 'DrillHoles')
head.set('total_holes', '238')
description = SubElement(head,'description')
current_group = None
i = 0
for row in reader:
if i > 0:
x1,y1,z1,x2,y2,z2,cost = row
if current_group is None or i != current_group.text:
current_group = SubElement(description, 'hole',{'hole_id':"%s"%i})
collar = SubElement (current_group, 'collar',{'':', '.join((x1,y1,z1))}),
toe = SubElement (current_group, 'toe',{'':', '.join((x2,y2,z2))})
cost = SubElement(current_group, 'cost',{'':cost})
i+=1
indent.indent(root)
tree.write(open('hole.xml','w'))
if (__name__ == "__main__"):
sys.exit(main(sys.argv))
for the generate_xml() function, you can ignore it since it accepts csv files formatted a certain way so you might not understand it but, i think the problem lies in tree.write() since that part generates the xml file with a name that is written in the code itself and not the arguments at the command prompt.
You need to pass a file argument to generate_xml(). You appear to have the output file in args.outputfile.
generate_xml(reader, args.outputfile)
...
def generate_xml(reader, outfile):
...
tree.write(outfile)
You should probably also make use of args.inputfile:
reader = read_csv(args.inputfile)
...
def read_csv(inputfile):
return list(csv.reader(inputfile))
And this line does not do anything useful, it processes the .csv file, but doesn't do anything with the results:
read_csv()
The following code has been adapted from FB36's recipie on code.activestate.com
It will do what you need and you don't have to worry about the headers in the csv file, though there should only be one header (the first row) in the csv file. Have a look at the bottom of this page if you want to do batch conversion.
'''Convert csv to xml file
csv2xml.py takes two arguments:
1. csvFile: name of the csv file (may need to specify path to file)
2. xmlFile: name of the desired xml file (path to destination can be specified)
If only the csv file is provided, its name is used for the xml file.
Command line usage:
example1: python csv2xml.py 'fileName.csv' 'desiredName.xml'
example2: python csv2xml.py '/Documents/fileName.csv' '/NewFolder/desiredName.xml'
example3: python csv2xml.py 'fileName.csv'
This code has been adapted from: http://code.activestate.com/recipes/577423/
'''
import csv
def converter(csvFile, xmlFile):
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<csv_data>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write('<row>' + "\n")
for i in range(len(tags)):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</row>' + "\n")
rowNum +=1
xmlData.write('</csv_data>' + "\n")
xmlData.close()
## for using csv2xml.py from the command line
if __name__ == '__main__':
import sys
if len(sys.argv)==2:
import os
csvFile = sys.argv[1]
xmlFile = os.path.splitext(csvFile)[0] + '.xml'
converter(csvFile,xmlFile)
elif len(sys.argv)==3:
csvFile = sys.argv[1]
xmlFile = sys.argv[2]
converter(csvFile,xmlFile)
else:
print __doc__
I have written a code which opens multiple files in a directory and prints only the first instance of match of required text from each file as output.
Now I want this output in a file. Doing it simply by putting print >> file.txt,... or .write or csv.write inside loop will not serve the purpose.
My code is:
import re, os, csv, sys
path = "D:\\"
in_files = os.listdir(path)
moldesc = ['Match1', 'Match2']
for f in in_files:
file = os.path.join(path, f)
text = open(file, "r")
for line in text:
if moldesc[0] in line:
Text1 = line.split()[-1]
if moldesc[1] in line:
Text2 = line.split()[-1]
print f, Text1, Text2 # I WANT THIS OUTPUT IN A FILE
break
text.close()
print "We are extraction done !!!"
you managed to open a file for reading, that's just one step away from opening a file for writing.
out = open(outfile, "w")
for f in in_files:
...
output_string = "{},{},{}\n".format(f, HOMO, LUMO)
out.write(output_string)