Searching words from a file in another file - python

I have got 2 files:
access.log.13 : a simple access log from a web server.
bots.txt : that contains spider's and crawlers names, each one in a different line, for example: googlebot mj12bot baidu etc etc
I would like to create a third file "hits.txt" with all the lines from "access.log.13" that contains any of the words from the file "spiders.txt"
This is my little Frankeinstein:
file_working = file("hits.txt", "wt")
file_1_logs = open("access.log.13", "r")
file_2_bots = open("bots.txt", "r")
file_3_hits = open("hits.txt", "a")
list_1 = arxiu_1_logs.readlines()
list_2 = arxiu_2_bots.readlines()
file_3_hits.write("Lines with bots: \n \n")
for i in list_2:
for j in list_1:
if i in j:
file_3_hits.write(j)
arxiu_1_logs.close()
arxiu_2_bots.close()
It doesn't work as i would like cause i only get hits when the line in bots.txt is exactly the same than any line in access.log.13. Thx

You can do it in a more pythonish way:
with open('spiders.txt') as fh:
words = set(re.split(r'[ \n\r]+', fh.read())) # set of searched words
with open('access.log.13') as file_in, \
open('hits.txt', 'w') as file_out:
for line in file_in:
if any(word in line for word in words): # look for any of the words
file_out.write(line)
Or you can use even nicer comprehension:
with open(...) as file_in, open (...) as file_out: # same as previously
good_lines = (line for line in file_in if any(word in line for word in words))
for good_line in good_lines:
file_out.write(good_line)

Replace the if with this:
if j.find(i) != -1

Related

Remove a word from a file

I am try to sort through the following file
Fantasy
Supernatural
Fantasy
UrbanFantasy
Fantasy
EpicFantasy
Fantasy
HighFantasy
I want to remove the word fantasy when it appears by itself and put the new list into another file
I tried
def getRidofFantasy():
file = open("FantasyGenres.txt", "r")
new_file = open("genres/fantasy", "w")
for line in file:
if line != "Fantasy":
new_file.write(line)
file.close()
new_file.close()
This does not work and I am at a lost as to why. The new file is the same as the old one. Can anyone explain what's happening and give an example of the correct solution?
Try this
with open('fantasy.txt') as f, open('generes/fantasy', 'w') as nf:
lines = [line+'\n' for line in f.read().splitlines() if line != "Fantasy"]
nf.writelines(lines)
In your code when you do for line in f the line variable also include the \n (endline) char, that's why it doesn't work.
Try this. -
def getRidofFantasy():
with open("FantasyGenres.txt", "r") as file:
content = [line.strip('\n') for line in file.readlines()]
new_list = list(filter(lambda a: a != 'Fantasy', content))
with open("genres/fantasy.txt", "w") as new_file:
[new_file.write(f'{line}\n') for line in new_list]
getRidofFantasy()
Similar to #Atin's answer, you can also do this:
with open('fantasy.txt') as f, open('generes/fantasy', 'w') as nf:
lines = [line for line in f.readlines() if line.strip() != "Fantasy"]
nf.writelines(lines)
That is because a new line is also a character:
Fantasy\n
Supernatural\n
etc.
You have to account for that. One possibility:
def getRidofFantasy():
with open("FantasyGenres.txt", "r") as f: # this way Python closes the file buffer for you
oldfile = f.readlines()
new_file = open("genres/fantasy", "w")
for line in oldfile:
line = line.rstrip('\n')
if line != "Fantasy":
new_file.write(line+'\n') # make sure to append the newline character again
new_file.close()
Okay so there is one thing you should know. When you read a line like that the variable will look something like this:-
line='Fantasy\n'
So, you need to strip that character. The simple solution without changing any of your code would be to just change the if statement. Change it to
if not 'Fantasy'== line.strip() and keep your code as it is and the new file that'll be generated will be the one you want.

I need to search a string in data file. but the string is written in another file

I have key words to be search in one file let say abc.txt and in another file I have my data, def.txt.
I want a code in python to find key words written in abc.txt, in def.txt and if present, print those line in a new file.
Thank you.
I tried writing a code but it didn't work.
following is the code I write.
f = open('/home/vivek/Documents/abc.txt')
f1 = open('output.txt', 'a')
f2 = open('/home/vivek/Documents/def.txt', 'r')
# doIHaveToCopyTheLine=False
for line in f.readlines():
if f2 in line:
f1.write(line)
f1.close()
f.close()
f2.close()
Load the keywords into a list then you can check the other file line-by-line, and write to outfile as you find keywords in the line.
with open('/path/to/keywords.txt') as f:
keywords = set(line.strip() for line in f) # assuming words are separated by line
with open('/path/to/search_me.txt') as f, open('/path/to/outfile.txt', 'w') as outfile:
for line in f:
if any(kw in line for kw in keywords):
outfile.write(line)
You should record all the words in abc.txt use a set and then search them in def.txt
word_set = set()
with open('/home/vivek/Documents/abc.txt') as f:
for line in f:
word_set.add(line.strip())
f1 = open('output.txt', 'a')
with open('/home/vivek/Documents/def.txt') as f:
for line in f:
find = False
for word in word_set:
if word in line:
find = True
break
if find:
f1.write(line)
f1.close()
You can try this code:
with open("keyword.txt", "r") as keyword_file:
keywords = keyword_file.read().strip()
keywords = keywords.split()
with open("data.txt", "r") as data_file, open("output.txt", "w") as output_file:
for line in data_file.readlines():
line = line.strip()
for word in keywords:
if line.find(word) != -1:
print line
output_file.writelines(line + '\n')
break
In addition to sytech's answer you may try this:
with open('def.txt') as kw_obj, open('abc.txt') as in_obj:
keywords = set(kw_obj.read().split())
in_lines = in_obj.readlines()
match_lines = [line for keyword in keywords for line in in_lines if keyword in line]
if match_lines:
with open('out.txt', 'w') as out:
out.write(''.join(match_lines))

writing output on file doesn't work in python

I have the code below to write out a list of N-grams in Python.
from nltk.util import ngrams
def word_grams(words, min=1, max=6):
s = []
for n in range(min, max):
for ngram in ngrams(words, n):
s.append(' '.join(str(i) for i in ngram))
return s
email = open("output.txt", "r")
for line in email.readlines():
with open('file.txt', 'w') as f:
for line in email:
prnt = word_grams(email.split(' '))
f.write("prnt")
email.close()
f.close()
when I print out the word_grams it prints out the files correctly but when it comes to writing the output into files.txt it doesn't work. The "file.txt" is empty.
So I guess the problem must be within these lines of codes:
for line in email.readlines():
with open('file.txt', 'w') as f:
for line in email:
prnt = word_grams(email.split(' '))
f.write("prnt")
email.close()
f.close()
1) the final f.close() does something else than what you want (f inside the loop is another object)
2) You name the file "file.txt" but want the output in "files.txt". Are you sure that you are looking in a correct file?
3) You are overwriting the file for each line in the email. Perhaps the with statement for "file.txt" should be outside the loop.
4) You are writing "prnt" instead of prnt
Something like this?
def word_grams(words, min=1, max=6):
s = []
for n in range(min, max):
for ngram in ngrams(words, n):
s.append(' '.join(str(i) for i in ngram))
return s
with open("output.txt", "r") as email:
with open('file.txt', 'w') as f:
for line in email.readlines():
prnt = word_grams(line.split(' '))
for ngram in prnt:
f.write(ngram)
I don't know what you are trying to accomplish exactly, but it seems that you would like to apply the function word_grams to every word in the file "output.txt" and save the output to a file called "file.txt", probably one item per line.
With these assumptions, I would recommend to rewrite your iteration in this manner:
words = []
# load words from input
with open("output.txt") as f:
for line in f:
words += line.strip().split(" ")
# generate and save output
grams = apply(word_grams, words)
with open("file.txt", "w") as f:
f.write("\n".join(grams))
However, this code assumes that the function word_grams is working properly.
Your code in loop:
for line in email:
did not run!
Because after email.readlines()run,the variable email is empty.
You can do some test like fallows:
email = open("output.txt", "r")
for line in email.readlines():
print '1'
for line in email:
print '2'
if you have 3 lines in your output.txt,after you run this test,you will get:
1
1
1
in the output.
And you can do a test like this:
email = open("output.txt", "r")
email.readlines()
you will see a list with the lines in your output.txt.
but when you run email.readlines()again,you will get an empty list!
so,there should be the problem.your variable email is empty in your second loop.

How to delete lines in a file in Python?

I have a text file file that looks like this:
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
it
0.451455675985 -0.0309283486444 -0.233415252863 -0.0273732833795 -0.294310277236 0.324236481567 -0.084486587459 0.340305398253 -0.56250445207 0.00640281538272
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
movie
0.444291696176 -0.110937149049 -0.259525377532 0.00986849685667 -0.311934727067 0.319610517473 -0.0644468651461 0.372562407 -0.572686043624 0.0262434708424
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
I want to keep in the file only the lines that contain both a word and a set of values on the same line.
How can I delete the rest of them?
The expected output is:
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
A few ways to solve the problem.
Read the file as CSV. If the column count is 12 and the first column is not a blank string, write it out:
import csv
with open('original.txt','r') as f, open('new.txt','w') as o:
reader = csv.reader(f, delimiter=' ')
writer = csv.writer(o, delimiter=' ')
for row in reader:
if len(row) == 12 and row[0]:
writer.write(row)
Read the file, write out those lines where the first item is a word and there are more than 2 columns:
with open('original.txt', 'r') as f, open('new.txt', 'w') as o:
for line in f:
if line.lstrip().split(' ')[0].isalpha() and len(line.split(' ')) > 2:
o.write(line)
You can implement this very nicely using the fileinput module:
import fileinput
import sys
for line in fileinput.input(inplace=True):
if line.find(' ') > 0:
sys.stdout.write(line)
Note that this modifies all the files given on the command line in-place, i.e. they will be modified.
You can open the file and read it line by line. Each line that contains a letter can be copied to another file.
letters = "abcdefghijklmnopqrstuvwxyz"
letters += letters.upper()
f_in = open("myfile.txt", 'rb')
f_out = open("newfile.txt", 'wb')
for line in f_in:
for letter in letters:
if letter in line:
f_out.write(line)
break
f_in.close()
f_out.close()
Be careful, the example you have given contains only one long line.
This line has to be split into multiple lines.
Try this. Hope this gives you the desired result.
import re
f = open("i1.txt", "r")
lines = f.readlines()
new_list = []
for items in lines:
if re.match("[a-zA-Z]+[\s\d]+", items.strip()) or re.match("[\d.\s]+[a-zA-Z]+[\s\d.]*", items.strip()):
print items.strip()
The following is the output
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
0.34567 yes 0.9876 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062

Match the last word and delete the entire line

Input.txt File
12626232 : Bookmarks
1321121:
126262
Here 126262: can be anything text or digit, so basically will search for last word is : (colon) and delete the entire line
Output.txt File
12626232 : Bookmarks
My Code:
def function_example():
fn = 'input.txt'
f = open(fn)
output = []
for line in f:
if not ":" in line:
output.append(line)
f.close()
f = open(fn, 'w')
f.writelines(output)
f.close()
Problem: When I match with : it remove the entire line, but I just want to check if it is exist in the end of line and if it is end of the line then only remove the entire line.
Any suggestion will be appreciated. Thanks.
I saw as following but not sure how to use it in here
a = "abc here we go:"
print a[:-1]
I believe with this you should be able to achieve what you want.
with open(fname) as f:
lines = f.readlines()
for line in lines:
if not line.strip().endswith(':'):
print line
Here fname is the variable pointing to the file location.
You were almost there with your function. You were checking if : appears anywhere in the line, when you need to check if the line ends with it:
def function_example():
fn = 'input.txt'
f = open(fn)
output = []
for line in f:
if not line.strip().endswith(":"): # This is what you were missing
output.append(line)
f.close()
f = open(fn, 'w')
f.writelines(output)
f.close()
You could have also done if not line.strip()[:-1] == ':':, but endswith() is better suited for your use case.
Here is a compact way to do what you are doing above:
def function_example(infile, outfile, limiter=':'):
''' Filters all lines in :infile: that end in :limiter:
and writes the remaining lines to :outfile: '''
with open(infile) as in, open(outfile,'w') as out:
for line in in:
if not line.strip().endswith(limiter):
out.write(line)
The with statement creates a context and automatically closes files when the block ends.
To search if the last letter is : Do following
if line.strip().endswith(':'):
...Do Something...
You can use a regular expression
import re
#Something end with ':'
regex = re.compile('.(:+)')
new_lines = []
file_name = "path_to_file"
with open(file_name) as _file:
lines = _file.readlines()
new_lines = [line for line in lines if regex.search(line.strip())]
with open(file_name, "w") as _file:
_file.writelines(new_lines)

Categories

Resources