import os
def find_method(name):
i = 0
found_dic = { "$_GET":[], "$_POST":[], "include":[], "require":[], "mysql_query":[], "SELECT":[], "system":[], "exec":[], "passthru":[], "readfile":[], "fopen":[], "eval":[] }
for x in file(name, "r"):
i += 1
for key in found_dic:
if x.strip().find(key) != -1:
found_dic[key].append("LINE:"+str(i)+":" + x.strip())
print "="*20, name, "="*20
for key in found_dic:
if found_dic[key]:
print " ", "-"*10, key, "-"*10
for r in found_dic[key]:
print " ",r
def search(dirname):
flist = os.listdir(dirname)
for f in flist:
next = os.path.join(dirname, f)
if os.path.isdir(next):
search(next)
else:
doFileWork(next)
def doFileWork(filename):
ext = os.path.splitext(filename)[-1]
#if ext == '.html': print filename
if ext == '.php':
# print "target:" + filename
find_method(filename)
how can I print only results. its prints all name of file eventhough file doesn't have any result in it. I want to make print file name if its has any result in it
this is about searching word, but it shows every word include like (seaching for include) then it also finds word in sentence and prints all sentence I want to find only word "include" not included in sentence. it's really hard to explain.. I hope to understand.. srry
It looks like there may be a problem with the indentation of the first print command, you are printing 'name', but it is outside of the for loop.
Try populating your dictionary, and then printing the dictionary, along the lines of:
with open(your_file) as f:
found_dic = {}
key = 'your_key'
# populate the dictionary
found_dic[key] = [i for i in f if key in i and i not in found_dic]
With this as a starting point, hopefully you can format the result to the dictionary as you need it. Only lines that include the 'key' will be in the found_dic, so you should be able to print these out in any format you like.
Hope this helps
I hope that's what you asked for:
for i, line in enumerate(file(name, "r")):
found = False
for key in found_dic:
if key in line.strip():
found_dic[key].append("LINE:"+str(i)+":" + key)
found = True
if found:
print "="*20, name, "="*20
for key in found_dic:
if found_dic[key]:
print " ", "-"*10, key, "-"*10
for r in found_dic[key]:
print " ",r
You have to check if you found something if you only want to print the name when you actually found something. Also, you only concatenate key in line 5, because key is what you search. And you only want to add what you search.
Further changes:
I used the enumerate function in line i, its far easier and more readable than incrementing you own i.
I also changed the condition in line 10. Using the in keyword here is the more simple and readable way...
Related
I have a problem, I have a dictionary and i need to pass a function to each value of the dictionary, I need my output to include the value key + the result of the function.
The program itself should identify open reading frames in a given DNA sequences, I have this part working but i need to print the name of sequence + the open reading frames.
Im new to programming and i got stuck. All help will be greatly appreciated.
#converts a fasta file into a dictionary.
import re
myfile = input("Enter a file name and directory:")
try:
f=open(myfile)
except IOError:
print("File doesn't exist!")
seqs = {}
for line in f:
line=line.rstrip()
#Gets rid of triling empy spaces.
if line[0]=='>':
words=line.split()
name=words[0][1:]
seqs[name] = ''
else:
seqs[name] = seqs[name] + line
print("Number of entries:",len(seqs.keys()))
length_seqs = {key:len(seq)for key, seq in seqs.items()}
sorted_length_seqs = sorted(length_seqs.items(), key=lambda kv:kv[1])
print("Entries by length:",sorted_length_seqs)
#finds the ORF in the dictionary sequences.
def find_ORFs(DNA):
ORFs = []
if 'ATG' in DNA:
for startMatch in re.finditer('ATG',DNA):
remaining = DNA[startMatch.start():]
for stopMatch in re.finditer('TAA|TGA|TAG',remaining):
substring = remaining[:stopMatch.end()]
if len(substring) % 3 == 0:
ORFs.append(substring)
break
else:
print("There are no ORFs in your sequence")
ORFs.sort(key=len, reverse=True)
print(ORFs)
for ORF in ORFs:
print(ORF,'ORF lenght',len(ORF))
#passes the function to the dictionary values.
#Here i need to pass the function to each of the values of the dictionary but i cant manage to make it print the the dictionary key of each value.
for seq in seqs:
DNA = seqs[seq]
ORF = find_ORFs(DNA)
You could pass the name of the sequence to the function as well:
def find_ORFs(name, DNA):
ORFs = []
if 'ATG' in DNA:
for startMatch in re.finditer('ATG',DNA):
remaining = DNA[startMatch.start():]
for stopMatch in re.finditer('TAA|TGA|TAG',remaining):
substring = remaining[:stopMatch.end()]
if len(substring) % 3 == 0:
ORFs.append(substring)
break
else:
print("There are no ORFs in your sequence: {}".format(name))
ORFs.sort(key=len, reverse=True)
print(ORFs)
for ORF in ORFs:
print("{}, {}, {}".format(name, len(ORF), ORF))
for name, DNA in seqs.items():
ORF = find_ORFs(name, DNA)
You should also take a look at some external libraries that can help you with parsing FASTA files, such as SeqIO from BioPython.
New to Python and I'm trying to count the words in a directory of text files and write the output to a separate text file. However, I want to specify conditions. So if word count is > 0 is would like to write the count and file path to one file and if the count is == 0. I would like to write the count and file path to a separate file. Below is my code so far. I think I'm close, but I'm hung up on how to do the conditions and separate files. Thanks.
import sys
import os
from collections import Counter
import glob
stdoutOrigin=sys.stdout
sys.stdout = open("log.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
for filepath in glob.iglob(os.path.join("path", '*.txt')):
with open(filepath) as f:
data = f.read()
for key,val in words.items():
#print("key is " + key + "\n")
ct = data.count(key)
words[key] = ct
if action:
action(filepath, words)
def print_summary(filepath, words):
for key,val in sorted(words.items()):
print(filepath)
if val > 0:
print('{0}:\t{1}'.format(
key,
val))
filepath = sys.argv[1]
keys = ["x", "y"]
words = dict.fromkeys(keys,0)
count_words_in_dir(filepath, words, action=print_summary)
sys.stdout.close()
sys.stdout=stdoutOrigin
I would strongly urge you to not repurpose stdout for writing data to a file as part of the normal course of your program. I also wonder how you can ever have a word "count < 0". I assume you meant "count == 0".
The main problem that your code has is in this line:
for filepath in glob.iglob(os.path.join("path", '*.txt')):
The string constant "path" I'm pretty sure doesn't belong there. I think you want filepath there instead. I would think that this problem would prevent your code from working at all.
Here's a version of your code where I fixed these issues and added the logic to write to two different output files based on the count:
import sys
import os
import glob
out1 = open("/tmp/so/seen.txt", "w")
out2 = open("/tmp/so/missing.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
with open(filepath) as f:
data = f.read()
for key, val in words.items():
# print("key is " + key + "\n")
ct = data.count(key)
words[key] = ct
if action:
action(filepath, words)
def print_summary(filepath, words):
for key, val in sorted(words.items()):
whichout = out1 if val > 0 else out2
print(filepath, file=whichout)
print('{0}: {1}'.format(key, val), file=whichout)
filepath = sys.argv[1]
keys = ["country", "friend", "turnip"]
words = dict.fromkeys(keys, 0)
count_words_in_dir(filepath, words, action=print_summary)
out1.close()
out2.close()
Result:
file seen.txt:
/Users/steve/tmp/so/dir/data2.txt
friend: 1
/Users/steve/tmp/so/dir/data.txt
country: 2
/Users/steve/tmp/so/dir/data.txt
friend: 1
file missing.txt:
/Users/steve/tmp/so/dir/data2.txt
country: 0
/Users/steve/tmp/so/dir/data2.txt
turnip: 0
/Users/steve/tmp/so/dir/data.txt
turnip: 0
(excuse me for using some search words that were a bit more interesting than yours)
Hello I hope I understood your question correctly, this code will count how many different words are in your file and depending on the conditions will do something you want.
import os
all_words = {}
def count(file_path):
with open(file_path, "r") as f:
# for better performance it is a good idea to go line by line through file
for line in f:
# singles out all the words, by splitting string around spaces
words = line.split(" ")
# and checks if word already exists in all_words dictionary...
for word in words:
try:
# ...if it does increment number of repetitions
all_words[word.replace(",", "").replace(".", "").lower()] += 1
except Exception:
# ...if it doesn't create it and give it number of repetitions 1
all_words[word.replace(",", "").replace(".", "").lower()] = 1
if __name__ == '__main__':
# for every text file in your current directory count how many words it has
for file in os.listdir("."):
if file.endswith(".txt"):
all_words = {}
count(file)
n = len(all_words)
# depending on the number of words do something
if n > 0:
with open("count1.txt", "a") as f:
f.write(file + "\n" + str(n) + "\n")
else:
with open("count2.txt", "a") as f:
f.write(file + "\n" + str(n) + "\n")
if you want to count same word multiple times you can add up all values from dictionary or you can eliminate try-except block and count every word there.
So i have am trying to go through a text file, put it into a dictionary, and then check to see if a string is already in it. if it is, i want to change the "1" to a "2". Currently if the string is already in the text file, it will just make a new line but with a "2". is there a way to edit the text file so the number can stay in the same place but be replaced?
class Isduplicate:
dicto = {}
def read(self):
f = open(r'C:\Users\jacka\OneDrive\Documents\outputs.txt', "r")
for line in f:
k, v = line.strip().split(':')
self.dicto[k.strip()] = int(v.strip())
return self.dicto
Is = Isduplicate()
while counter < 50:
e = str(elem[counter].get_attribute("href"))
e = e.replace("https://www.reddit.com/r/", "")
e = e[:-1]
if e in Is.read():
Is.dicto[e] += 1
else:
Is.dicto[e] = 1
text_file.write(e + ":" + str(Is.dicto[e]) + '\n')
print(e)
counter = counter +2
You can not rewrite a particular byte in a file, you have to rewrite the file in a whole.
Probably reading the file into the list of strings, processing that list and writing it back to the file would solve your task.
#
# Obtain user input for file name, and open it
#
inFile = open(input("Enter file name: "), "r")
#
# Process data and address possible errors
#
countDinner = 0
countLodging = 0
countConference = 0
valueDinner = 0
valueLodging = 0
valueConference = 0
done = False
while not done :
line = inFile.readline()
try :
s = line
serviceAmount = ';'.join(s.split(';')[1:-1]) #Removes date and name regardless of format
serviceAmount.split(";")
s.lower()
if "dinner" in s :
countDinner = countDinner + 1
valueDinner = valueDinner + int(filter(str.isdigit, s))
print("Dinners: ", countDinner, "Value of Dinner sales: ", valueDinner)
elif "lodging" in s :
countLodging = countLodging + 1
valueLodging = valueLodging + int(filter(str.isdigit, s))
print("Lodging: ", countLodging, "Value of Lodging sales: ", valueLodging)
elif "conference" in s :
countConference = countConference + 1
valueConference = valueConference + int(filter(str.isdigit, s))
print("Conferences: ", countConference, "Value of Conference sales: ", valueConference)
elif line == "" :
done = True
else :
print("Invalid file format.")
except FileNotFoundError :
print("Unable to find file.")
finally :
done = True
inFile.close()
Returns "Invalid file format" even when the document is set up specifically for this code. I'm not getting a syntax error, so I'm not sure whats wrong.
The document contains the text:
John;Lodging;123;050617
Tyler;Conference;123;081497
Taylor;Dinner;453;041798
There are a lot of things you aren't doing quite right here. I tried to not only fix the issue you posted about, but also write some code that should be more clear and easier to use. I left comments to explain things.
# Don't open the file here, just get the file name. We will open in later
fname = input("Enter file name: ")
# I think using dicts is more clearn and organized. Having so many variables I think makes the code messy
counts = {"Dinner": 0,
"Lodging": 0,
"Conference": 0}
values = {"Dinner": 0,
"Lodging": 0,
"Conference": 0}
# Lets try to open the file
try:
with open(fname, 'r') as inFile: # Use "with", this way the file is closed automatically when we are done reading it
for linenum, line in enumerate(inFile): # I want to enumerate each line. If there is an error on a line, we can display the line nmber this way
line = line.lower().split(';')[1:-1] # lets make it all lower case, then split and drop as needed
print(line)
if "dinner" in line :
counts["Dinner"] += 1 # x += 1 is the same as x = x + 1, but cleaner
values["Dinner"] += int(line[1])
print("Dinners: {} Value of Dinner sales: {}".format(counts["Dinner"], values["Dinner"]))
elif "lodging" in line :
counts["Lodging"] += 1
values["Lodging"] += int(line[1])
print("Lodging: {} Value of Dinner sales: {}".format(counts["Lodging"], values["Lodging"]))
elif "conference" in line :
counts["Conference"] += 1
values["Conference"] += int(line[1])
print("Conference: {} Value of Dinner sales: {}".format(counts["Conference"], values["Conference"]))
else :
print("Invalid file format on line {}".format(linenum)) # Here is why we used enumerate in the for loop
except FileNotFoundError:
print("Unable to find file.")
Here is your problem:
serviceAmount = ';'.join(s.split(';')[1:-1]) #Removes date and name regardless of format
serviceAmount.split(";")
You should do:
serviceAmount = ';'.join(s.lower().split(';')[1:-1])
You are checking against lower case strings, but not actually lower casing your input.
It is also important to note that s.lower() doesn't actually change s, it just returns a string where all the letters of s have been switched to lower case. Same thing for split (as in not changing the string its called on, not that it returns a string).
Another problem you are going to run into is getting the numbers from your strings.
int(filter(str.isdigit, s))
Won't work. You can use split again like you did earlier (or just not re-join since you only care about the first element in the comparisons).
int(serviceAmount.split(';')[1])
The last thing is the
finally:
done = True
inFile.close()
finally always runs when exiting a try, meaning that you are always done after each loop (and close the file after you read the first line).
If you remove the finally and add inFile.close() inside the elif line == "" it will close, and set done only when you've reached the end of the file.
It could be done as simple as
categories = {}
filename = input("Enter file name: ")
with open(filename, "r") as file:
name, category, value, date = file.readline().split(";")
if category not in categories:
categories[category] = {"count": 0, "value": 0}
categories[category]["count"] += 1
categories[category]["value"] += int(value)
At the end, you'll have a dict with categories, their count, and value, and also their names are not hard-coded.
I am working on a project that requires me to be able to search for multiple keywords in a file. For example, if I had a file with 100 occurrences of the word "Tomato", 500 for the word "Bread", and 20 for "Pickle", I would want to be able to search the file for "Tomato" and "Bread" and get the number of times it occurs in the file. I was able to find people with the same issue/question, but for other languages on this site.
I a working program that allows me to search for the column name and tally how many times something shows up in that column, but I want to make something a bit more precise. Here is my code:
def start():
location = raw_input("What is the folder containing the data you like processed located? ")
#location = "C:/Code/Samples/Dates/2015-06-07/Large-Scale Data Parsing/Data Files"
if os.path.exists(location) == True: #Tests to see if user entered a valid path
file_extension = raw_input("What is the file type (.txt for example)? ")
search_for(location,file_extension)
else:
print "I'm sorry, but the file location you have entered does not exist. Please try again."
start()
def search_for(location,file_extension):
querylist = []
n = 5
while n == 5:
search_query = raw_input("What would you like to search for in each file? Use'Done' to indicate that you have finished your request. ")
#list = ["CD90-N5722-15C", "CD90-NB810-4C", "CP90-N2475-8", "CD90-VN530-22B"]
if search_query == "Done":
print "Your queries are:",querylist
print ""
content = os.listdir(location)
run(content,file_extension,location,querylist)
n = 0
else:
querylist.append(search_query)
continue
def run(content,file_extension,location,querylist):
for item in content:
if item.endswith(file_extension):
search(location,item,querylist)
quit()
def search(location,item,querylist):
with open(os.path.join(location,item), 'r') as f:
countlist = []
for search in querylist: #any search value after the first one is incorrectly reporting "0"
countsearch = 0
for line in f:
if search in line:
countsearch = countsearch + 1
countlist.append(search)
countlist.append(countsearch) #mechanism to update countsearch is not working for any value after the first
print item, countlist
start()
If I use that code, the last part (def search) is not working correctly. Any time I put a search in, any search after the first one I enter in returns "0", despite there being up to 500,000 occurrences of the search word in a file.
I was also wondering, since I have to index 5 files with 1,000,000 lines each, if there was a way I could write either an additional function or something to count how many times "Lettuce" occurs over all the files.
I cannot post the files here due to their size and content. Any help would be greatly appreciated.
Edit
I also have this piece of code here. If I use this, I get the correct count of each, but it would be much better to have a user be able to enter as many searches as they want:
def check_start():
#location = raw_input("What is the folder containing the data you like processed located? ")
location = "C:/Code/Samples/Dates/2015-06-07/Large-Scale Data Parsing/Data Files"
content = os.listdir(location)
for item in content:
if item.endswith("processed"):
countcol1 = 0
countcol2 = 0
countcol3 = 0
countcol4 = 0
#print os.path.join(currentdir,item)
with open(os.path.join(location,item), 'r') as f:
for line in f:
if "CD90-N5722-15C" in line:
countcol1 = countcol1 + 1
if "CD90-NB810-4C" in line:
countcol2 = countcol2 + 1
if "CP90-N2475-8" in line:
countcol3 = countcol3 + 1
if "CD90-VN530-22B" in line:
countcol4 = countcol4 + 1
print item, "CD90-N5722-15C", countcol1, "CD90-NB810-4C", countcol2, "CP90-N2475-8", countcol3, "CD90-VN530-22B", countcol4
You are trying to iterate over your file more than once. After the first time, the file pointer is at the end so subsequent searches will fail because there's nothing left to read.
If you add the line:
f.seek(0), this will reset the pointer before every read:
def search(location,item,querylist):
with open(os.path.join(location,item), 'r') as f:
countlist = []
for search in querylist: #any search value after the first one is incorrectly reporting "0"
countsearch = 0
for line in f:
if search in line:
countsearch = countsearch + 1
countlist.append(search)
countlist.append(countsearch) #mechanism to update countsearch is not working for any value after the first
f.seek(0)
print item, countlist
PS. I've guessed at the indentation... You really shouldn't use tabs.
I'm not sure I get your question completely, but how about something like this?
def check_start():
raw_search_terms = raw_input('Enter search terms seperated by a comma:')
search_term_list = raw_search_terms.split(',')
#location = raw_input("What is the folder containing the data you like processed located? ")
location = "C:/Code/Samples/Dates/2015-06-07/Large-Scale Data Parsing/Data Files"
content = os.listdir(location)
for item in content:
if item.endswith("processed"):
# create a dictionary of search terms with their counts (initialized to 0)
search_term_count_dict = dict(zip(search_term_list, [0 for s in search_term_list]))
for line in f:
for s in search_term_list:
if s in line:
search_term_count_dict[s] += 1
print item
for key, value in search_term_count_dict.iteritems() :
print key, value