Remove duplicates after altering items - python

I have a script to clean urls to get base domains from example.com/example1 and example.com/example2 down to example.com My issue is when it goes to through the file of urls it will have duplicate base domains. I want to remove the duplicates while printing the urls to a file. below is the code I currently have.
enter from Tkinter import *
import tkFileDialog
import re
def main():
fileOpen = Tk()
fileOpen.withdraw() #hiding tkinter window
file_path = tkFileDialog.askopenfilename(
title="Open file", filetypes=[("txt file",".txt")])
if file_path != "":
print "you chose file with path:", file_path
else:
print "you didn't open anything!"
fin = open(file_path)
fout = open("URL Cleaned.txt", "wt")
for line in fin.readlines():
editor = (line.replace('[.]', '.')
.replace('[dot]', '.')
.replace('hxxp://www.', '')
.replace('hxxps://www.', '')
.replace('hxxps://', '')
.replace('hxxp://', '')
.replace('www.', '')
.replace('http://www.', '')
.replace('https://www.', '')
.replace('https://', '')
.replace('http://', ''))
editor = re.sub(r'/.*', '', editor)
if __name__ == '__main__':
main()
Any help is appreciated. I have scoured the posts and tried all of the suggestions for my issue and have not found one that works.

You can use regular expresion to find the base domains.
If you have one url per line in your file:
import re
def main():
file = open("url.txt",'r')
domains = set()
# will works for any web like https://www.domain.com/something/somethingmore... , also without www, without https or just for www.domain.org
matcher= re.compile("(h..ps?://)?(?P<domain>(www\.)?[^/]*)/?.*")
for line in file:
# make here any replace you need with obfuscated urls like: line = line.replace('[.]','.')
if line[-1] == '\n': # remove "\n" from end of line if present
line = line[0:-1]
match = matcher.search(line)
if match != None: # If a url has been found
domains.add(match.group('domain'))
print domains
file.close()
main()
For example, with this file, it will print:
set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])

perhaps you could use a regular expression:
import re
p = re.compile(r".*\.com/(.*)") # to get for instance 'example1' or 'example2' etc.
with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
lines = fin.readlines():
bases = set(re.search(p, line).groups()[0] for line in lines if len(line) > 1)
for b in bases:
fout.write(b)
Using with open(..) auto closes the files after the executing the block of code
Output:
Using a text file with:
www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3
www.example.com/example4
www.example.com/example4 # as are duplicates
as the lines, I got the output,
example1
example2
example3
example4

Related

How can I delete "\n" lines from a file in Python?

I need to check if the .csv file I'm working with ends with more than 1 "\n" line. If it finds more than a blank line, it removes them all but one.
My code is:
import os
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r+") as op:
lines = op.readlines()
for line in lines:
if line == "\n":
op.write(line.rstrip("\n"))
The .csv file is something like ['01-01-2019,0,0,0\n', '18-05-2019,33,31,48\n', '\n', '\n', '\n'] and the output I'd want is ['01-01-2019,0,0,0\n', '18-05-2019,33,31,48\n', '\n'] but it doesn't seem to be able to delete any line.
The simplest way would be to keep track if you've seen an empty line, then write one just before you write a non-empty line.
pre = ""
for line in lines:
if line == "\n":
pre = line
else:
op.write(pre)
op.write(line)
pre = "\n"
op.write(pre)
This reduces any sequence of empty lines to a single empty line, and writes that single line just before writing a non-empty line or the end of the file. When pre is the empty string, writing it is a no-op.
If you want to preserve multiple blank lines in the middle of the file, build up the sequence of blank lines in pre as you find them, and at the end of the file, only write a single blank line (rather than pre itself) if pre is not empty.
pre = ""
for line in lines:
if line == "\n":
pre += line
else:
op.write(pre)
op.write(line)
pre = ""
if pre:
op.write("\n")
Oops, never rewrite the file that you are reading: it is likely not to work or at best will lead to a maintenance nightmare.
If the file is small enough to fit in main memory, this slight change in your code could be enough:
import os.path
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r") as op:
lines = op.readlines() # read lines in memory
with open(path("w") as op: # re-write everything from the beginning
flag = False
for line in lines:
if line == "\n":
if not flag:
op.write(line)
flag = True
else:
op.write(line)
# flag = False # uncomment if you want to keep one blank line
# per group of consecutive lines
You could try using the Counter().
import os
from pathlib import Path
from collections import Counter
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r+") as op:
lines = op.readlines()
for line in lines:
count = Counter()
# Add 1 for every time word appears in line
for word in line:
count[word] += 1
# Change the number of newlines to 1
if count['\n'] > 1:
count['\n'] = 1
# Returns list with the number of elements
line = list(count.elements())
I managed to work this out, with this code:
import os
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r") as op:
lines = op.readlines() # read lines in memory
with open(path, "w") as op: # re-write everything from the beginning
for line in lines:
if line != "\n":
op.write(line)
else:
continue
It can remove every new line in excess, no matter where it is in the file.
Thanks to everyone who tried to help me!

How to edit specific line for all text files in a folder by python?

Here below is my code about how to edit text file.
Since python can't just edit a line and save it at the same time,
I save the previous text file's content into a list first then write it out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os
#search all text files which are in the same folder with python script
path = os.path.dirname(os.path.abspath(__file__))
txtlist = glob.glob(path + '\*.txt')
for file in txtlist:
fp1 = open(file, 'r+')
strings = [] #create a list to store the content
for line in fp1:
if 'apple' in line:
strings.append('banana\n') #change the content and store into list
else:
strings.append(line) #store the contents did not be changed
fp2 = open (file, 'w+') # rewrite the original text files
for line in strings:
fp2.write(line)
fp1.close()
fp2.close()
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is : Is there any method can do the same thing?
Like using the other functions or using the other data type rather than list.
Thank you everyone.
Simplify it to this:
with open(fname) as f:
content = f.readlines()
content = ['banana' if line.find('apple') != -1 else line for line in content]
and then write value of content to file back.
Instead of putting all the lines in a list and writing it, you can read it into memory, replace, and write it using same file.
def replace_word(filename):
with open(filename, 'r') as file:
data = file.read()
data = data.replace('word1', 'word2')
with open(filename, 'w') as file:
file.write(data)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
for line in files:
if 'apple' in line:
print('banana')
else:
print(line, end='')
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path, replace_table: dict) -> None:
list_lines = []
need_rewrite = False
with open(file_path, 'r') as f:
for line in f:
flag_rewrite = False
for key, new_val in replace_table.items():
if key in line:
list_lines.append(new_val+'\n')
flag_rewrite = True
need_rewrite = True
break # only replace first find the words.
if not flag_rewrite:
list_lines.append(line)
if not need_rewrite:
return
with open(file_path, 'w') as f:
[f.write(line) for line in list_lines]
if __name__ == '__main__':
work_dir = os.path.dirname(os.path.abspath(__file__))
txt_list = glob.glob(work_dir + '/*.txt')
replace_dict = dict(apple='banana', orange='grape')
for txt_path in txt_list:
replace_line(txt_path, replace_dict)

How to replace a string in a file?

I have 2 numbers in two similar files. There is a new.txt and original.txt. They both have the same string in them except for a number. The new.txt has a string that says boothNumber="3". The original.txt has a string that says boothNumber="1".
I want to be able to read the new.txt, pick the number 3 out of it and replace the number 1 in original.txt.
Any suggestions? Here is what I am trying.
import re # used to replace string
import sys # some of these are use for other code in my program
def readconfig():
with open("new.text") as f:
with open("original.txt", "w") as f1:
for line in f:
match = re.search(r'(?<=boothNumber=")\d+', line)
for line in f1:
pattern = re.search(r'(?<=boothNumber=")\d+', line)
if re.search(pattern, line):
sys.stdout.write(re.sub(pattern, match, line))
When I run this, my original.txt gets completely cleared of any text.
I did a traceback and I get this:
in readconfig
for line in f1:
io.UnsupportedOperationo: not readable
UPDATE
I tried:
def readconfig(original_txt_path="original.txt",
new_txt_path="new.txt"):
with open(new_txt_path) as f:
for line in f:
if not ('boothNumber=') in line:
continue
booth_number = int(line.replace('boothNumber=', ''))
# do we need check if there are more than one 'boothNumber=...' line?
break
with open(original_txt_path) as f1:
modified_lines = [line.startswith('boothNumber=') if not line
else 'boothNumber={}'.format(booth_number)
for line in f1]
with open(original_txt_path, mode='w') as f1:
f1.writelines(modified_lines)
And I get error:
booth_number = int(line.replace('boothNumber=', ''))
ValueError: invalid literal for int() with base 10: '
(workstationID="1" "1" window=1= area="" extra parts of the line here)\n
the "1" after workstationID="1" is where the boothNumber=" " would normally go. When I open up original.txt, I see that it actually did not change anything.
UPDATE 3
Here is my code in full. Note, the file names are changed but I'm still trying to do the same thing. This is another idea or revision I had that is still not working:
import os
import shutil
import fileinput
import re # used to replace string
import sys # prevents extra lines being inputed in config
# example: sys.stdout.write
def convertconfig(pattern):
source = "template.config"
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
match = r'(?<=boothNumber=")\d+'
sys.stdout.write(re.sub(match, pattern, line))
def readconfig():
source = "bingo.config"
pattern = r'(?<=boothNumber=")\d+' # !!!!!!!!!! This probably needs fixed
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
if re.search(pattern, line):
fileinput.close()
convertconfig(pattern)
def copyfrom(servername):
source = r'//' + servername + '/c$/remotedirectory'
dest = r"C:/myprogram"
file = "bingo.config"
try:
shutil.copyfile(os.path.join(source, file), os.path.join(dest, file))
except:
print ("Error")
readconfig()
# begin here
os.system('cls' if os.name == 'nt' else 'clear')
array = []
with open("serverlist.txt", "r") as f:
for servername in f:
copyfrom(servername.strip())
bingo.config is my new file
template.config is my original
It's replacing the number in template.config with the literal string "r'(?<=boothNumber=")\d+'"
So template.config ends up looking like
boothNumber="r'(?<=boothNumber=")\d+'"
instead of
boothNumber="2"
To find boothNumber value we can use next regular expression (checked with regex101)
(?<=\sboothNumber=\")(\d+)(?=\")
Something like this should work
import re
import sys # some of these are use for other code in my program
BOOTH_NUMBER_RE = re.compile('(?<=\sboothNumber=\")(\d+)(?=\")')
search_booth_number = BOOTH_NUMBER_RE.search
replace_booth_number = BOOTH_NUMBER_RE.sub
def readconfig(original_txt_path="original.txt",
new_txt_path="new.txt"):
with open(new_txt_path) as f:
for line in f:
search_res = search_booth_number(line)
if search_res is None:
continue
booth_number = int(search_res.group(0))
# do we need check if there are more than one 'boothNumber=...' line?
break
else:
# no 'boothNumber=...' line was found, so next lines will fail,
# maybe we should raise exception like
# raise Exception('no line starting with "boothNumber" was found')
# or assign some default value
# booth_number = -1
# or just return?
return
with open(original_txt_path) as f:
modified_lines = []
for line in f:
search_res = search_booth_number(line)
if search_res is not None:
line = replace_booth_number(str(booth_number), line)
modified_lines.append(line)
with open(original_txt_path, mode='w') as f:
f.writelines(modified_lines)
Test
# Preparation
with open('new.txt', mode='w') as f:
f.write('some\n')
f.write('<jack Fill workstationID="1" boothNumber="56565" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">')
with open('original.txt', mode='w') as f:
f.write('other\n')
f.write('<jack Fill workstationID="1" boothNumber="23" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">')
# Invocation
readconfig()
# Checking output
with open('original.txt') as f:
for line in f:
# stripping newline character
print(line.rstrip('\n'))
gives
other
<jack Fill workstationID="1" boothNumber="56565" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">

How do I search a file for a string and replace it with multiple lines in Python?

I am running Python 3.5.1
I have a text file that I'm trying to search through and replace or overwrite text if it matches a predefined variable. Below is a simple example:
test2.txt
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
Outdated line of information that has no comment above - message_label
The last line in this example needs to be overwritten so the new file looks like below:
test2.txt after script
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
# This is an important line that needs to be copied
Very Important Line of information that the above line is a comment for - message_label
The function I have written idealAppend does not work as intended and subsequent executions create a bit of a mess. My workaround has been to separate the two lines into single line variables but this doesn't scale well. I want to use this function throughout my script with the ability to handle any number of lines. (if that makes sense)
Script
#!/usr/bin/env python3
import sys, fileinput, os
def main():
file = 'test2.txt'
fullData = r'''
# This is an important line that needs to be copied
Very Important Line of information that the above line is a comment for - message_label
'''
idealAppend(file, fullData)
def idealAppend(filename, data):
label = data.split()[-1] # Grab last word of the Append String
for line in fileinput.input(filename, inplace=1, backup='.bak'):
if line.strip().endswith(label) and line != data: # If a line 2 exists that matches the last word (label)
line = data # Overwrite with new line, comment, new line, and append data.
sys.stdout.write(line) # Write changes to current line
with open(filename, 'r+') as file: # Open File with rw permissions
line_found = any(data in line for line in file) # Search if Append exists in file
if not line_found: # If data does NOT exist
file.seek(0, os.SEEK_END) # Goes to last line of the file
file.write(data) # Write data to the end of the file
if __name__ == "__main__": main()
Workaround Script
This seems to work perfectly as long as I only need to write exactly two lines. I'd love this to be more dynamic when it comes to number of lines so I can reuse the function easily.
#!/usr/bin/env python3
import sys, fileinput, os
def main():
file = 'test2.txt'
comment = r'# This is an important line that needs to be copied'
append = r'Very Important Line of information that the above line is a comment for - message_label'
appendFile(file, comment, append)
def appendFile(filename, comment, append):
label = append.split()[-1] # Grab last word of the Append String
for line in fileinput.input(filename, inplace=1, backup='.bak'):
if line.strip().endswith(label) and line != append: # If a line 2 exists that matches the last word (label)
line = '\n' + comment + '\n' + append # Overwrite with new line, comment, new line, and append data.
sys.stdout.write(line) # Write changes to current line
with open(filename, 'r+') as file: # Open File with rw permissions
line_found = any(append in line for line in file) # Search if Append exists in file
if not line_found: # If data does NOT exist
file.seek(0, os.SEEK_END) # Goes to last line of the file
file.write('\n' + comment + '\n' + append) # Write data to the end of the file
if __name__ == "__main__": main()
I am very new to Python so I'm hoping there is a simple solution that I overlooked. I thought it might make sense to try and split the fullData variable at the new line characters into a list or tuple, filter the label from the last item in the list, then output all entries but this is starting to move beyond what I've learned so far.
If I understand your issue correctly, you can just open the input and output files, then check whether the line contains old information and ends with the label and write the appropriate content accordingly.
with open('in.txt') as f, open('out.txt', 'r') as output:
for line in f:
if line.endswith(label) and not line.startswith(new_info):
output.write(replacement_text)
else:
output.write(line)
If you want to update the original file instead of creating a second one, it's easiest to just delete the original and rename the new one instead of trying to modify it in place.
Is this what you are looking for ? It's looking for a label and then replaces the whole line with whatever you want.
test2.txt
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
Here is to be replaced - to_replace
script.py
#!/usr/bin/env python3
def main():
file = 'test2.txt'
label_to_modify = "to_replace"
replace_with = "# Blabla\nMultiline\nHello"
"""
# Raw string stored in a file
file_replace_with = 'replace_with.txt'
with open(file_replace_with, 'r') as f:
replace_with = f.read()
"""
appendFile(file, label_to_modify, replace_with)
def appendFile(filename, label_to_modify, replace_with):
new_file = []
with open(filename, 'r') as f:
for line in f:
if len(line.split()) > 0 and line.split()[-1] == label_to_modify:
new_file.append(replace_with)
else:
new_file.append(line)
with open(filename + ".bak", 'w') as f:
f.write(''.join(new_file))
if __name__ == "__main__": main()
test2.txt.bak
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
# Blabla
Multiline
Hello
Reading over both answers I've come up with the following as the best solution i can get to work. It seems to do everything I need. Thanks Everyone.
#!/usr/bin/env python3
def main():
testConfFile = 'test2.txt' # /etc/apache2/apache2.conf
testConfLabel = 'timed_combined'
testConfData = r'''###This is an important line that needs to be copied - ##-#-####
Very Important Line of information that the above line is a \"r\" comment for - message_label'''
testFormatAppend(testConfFile, testConfData, testConfLabel) # Add new test format
def testFormatAppend(filename, data, label):
dataSplit = data.splitlines()
fileDataStr = ''
with open(filename, 'r') as file:
fileData = stringToDictByLine(file)
for key, val in fileData.items():
for row in dataSplit:
if val.strip().endswith(row.strip().split()[-1]):
fileData[key] = ''
fileLen = len(fileData)
if fileData[fileLen] == '':
fileLen += 1
fileData[fileLen] = data
else:
fileLen += 1
fileData[fileLen] = '\n' + data
for key, val in fileData.items():
fileDataStr += val
with open(filename, 'w') as file:
file.writelines(str(fileDataStr))
def stringToDictByLine(data):
fileData = {}
i = 1
for line in data:
fileData[i] = line
i += 1
return fileData
if __name__ == "__main__": main()

find and replace content between two words in POM

Need a bash script or python script to find and replace text between two tags?
E.g:
<start>text to find and replace with the one I give as input<end>
' text to find and replace with the one I give as input' is just an example and it could vary every time.
I want to do something like ./changetxt inputfile.xxx newtext
where changetxt has the script;
inputfile.xxx has the text that needs a change and newtext is what goes into inputfile.xxx
python:
import sys
if __name__ == "__main__":
#ajust these to your need
starttag = "<foo>"
endtag = "</foo>"
inputfilename = sys.argv[1]
outputfilename = inputfilename + ".out"
replacestr = sys.argv[2]
#open the inputfile from the first argument
inputfile = open(inputfilename, 'r')
#open an outputfile to put the result in
outputfile = open(outputfilename, 'w')
#test every line in the file for the starttag
for line in inputfile:
if starttag in line and endtag in line:
#compose a new line with the replaced string
newline = line[:line.find(starttag) + len(starttag)] + replacestr + line[line.find(endtag):]
#and write the new line to the outputfile
outputfile.write(newline)
else:
outputfile.write(line)
outputfile.close()
inputfile.close()
Save this in a replacetext.py file and run as python replacetext.py \path\to\inputfile "I want this text between the tags"
You could also use BeautifulSoup for this, from their docs:
If you set a tag’s .string attribute, the tag’s contents are replaced
with the string you give:
markup = 'I linked to <i>example.com</i>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.string = "New link text."
tag
# New link text.

Categories

Resources