I am working on a program that parses log files and returns the top hits for IP addresses and a couple of other things. I am currently stuck, and I have not been able to apply any of the existing answers about this error to my own code. This is all of my code:
import gzip
from collections import Counter
# Python 2 script. Reads log file names from fileNames.txt, opens "<name>.gz"
# for each of them, collects field index 2 (IP address) and the part of field
# index 8 before '?' (landing page) from every tab-separated line, and prints
# the ten most frequent values of each.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below cannot be determined from the text alone.
logFileName = open('C:\\Users\\Pawlaczykm\\Desktop\\fileNames.txt', 'r')
ipAdd = []
landingPages = []
ALL_ipAdd = []
ALL_landingPages = []
# everything after this line gets done to all files
for line in logFileName.readlines():
# rstrip removes a blank line from output
# print 'Summary of: ' + line.rstrip()
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the ip addresses in lines 15-18
for eachLine in f:
parts = eachLine.split('\t')
# NOTE(review): the guard only checks for more than one field, but index 2 is
# read below; a line with exactly two fields would raise IndexError.
if len(parts) > 1:
ipAdd.append(parts[2])
# NOTE(review): this appends the list object ipAdd itself, so ALL_ipAdd becomes
# a list of lists. Counter(ALL_ipAdd) further down then fails with
# "TypeError: unhashable type: 'list'".
ALL_ipAdd.append(ipAdd)
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the landing pages
for eachLine in f:
parts = eachLine.split('\t')
# NOTE(review): same guard as above, but index 8 is read here.
if len(parts) > 1:
# keep only the part of the field before the first '?'
variable = parts[8].split('?')[0]
landingPages.append(variable)
# NOTE(review): the next line is a truncated fragment (it looks like the tail
# of a sorted(..., key=lambda (k, v): (-v, k))[:10] call); the start of the
# statement is missing from this listing.
v): (-v, k))[:10]
# NOTE(review): same list-of-lists pattern as ALL_ipAdd.append(ipAdd) above.
ALL_landingPages.append(landingPages)
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
# Sort by descending count, then by key, and keep the first ten entries.
# iteritems(), tuple-unpacking lambda parameters and the print statement are
# Python 2 only.
sortedALL_ipAdd = sorted(ALL_ipAddDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top IPs of all files'
print(sortedALL_ipAdd)
ALL_LandingPageDict = dict(Counter(ALL_landingPages).most_common())
sortedALL_LandingPage = sorted(ALL_LandingPageDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top landing pages of all files'
print (sortedALL_LandingPage)
Now where I am having trouble is in the following line:
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
The output when I run the whole program is this:
Traceback (most recent call last):
File "C:/Users/Pawlaczykm/PycharmProjects/LogParse/parseText.py", line 35, in <module>
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
File "C:\Python27\lib\collections.py", line 477, in __init__
self.update(*args, **kwds)
File "C:\Python27\lib\collections.py", line 567, in update
self[elem] = self_get(elem, 0) + 1
TypeError: unhashable type: 'list'
Can somebody help me? This is frustrating.
From your code (ALL_ipAdd = [], ipAdd = [] and ALL_ipAdd.append(ipAdd)) we can conclude that ALL_ipAdd is a list of lists. Counter is a subclass of dict, so it hashes each item before counting it. Lists cannot be hashed because they are mutable (if the list changed, its hash would change), and therefore lists cannot be counted by Counter objects.
To solve this you can convert the inner lists to tuples before counting them:
ALL_ipAddDict = dict(Counter(map(tuple, ALL_ipAdd)).most_common())
That's normal. ALL_ipAdd is a list of lists, and Counter can only count hashable items such as strings, numbers or tuples; a list is not hashable :)
Related
I'm trying to run the below python script (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both python 2 and 3
######### error ###
Traceback (most recent call last):
File "./vcf2treemix.py", line 99, in <module>
main()
File "./vcf2treemix.py", line 95, in main
pop_obj = get_pops(pop_file)
File "./vcf2treemix.py", line 34, in get_pops
pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python
# vcf2treemix.py
# Converts a vcf file into TreeMix input
import argparse
from collections import OrderedDict
# Command-line interface: two required options, -vcf and -pop, stored as
# arg.vcf_file and arg.pop_file.
parser = argparse.ArgumentParser(description="Parsing statistical output of"
" VCFtools")
# NOTE(review): the help= strings below contain file paths rather than
# descriptions. help text is only displayed by --help; it does not supply a
# default value for the option.
parser.add_argument("-vcf", dest="vcf_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
required=True)
parser.add_argument("-pop", dest="pop_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
required=True)
# Arguments are parsed at module level; main() reads the result from `arg`.
arg = parser.parse_args()
def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each population should be on one line, starting
    with the pop name, a colon and the corresponding taxa separated by
    whitespace.
    E.g.:
    pop1: taxon1 taxon2 taxon3

    Blank lines are ignored. Populations are returned in file order.

    Args:
        pop_file: Path to the population file.

    Returns:
        OrderedDict mapping each population name to its list of taxa.

    Raises:
        ValueError: If a non-blank line does not contain a colon, i.e. the
            file is not in the "pop: taxon taxon ..." format.
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line_no, line in enumerate(fh, 1):
            stripped = line.strip()
            # A blank (e.g. trailing) line carries no population.
            if not stripped:
                continue
            fields = stripped.split(":")
            # Without a colon there is no taxa field; report the offending
            # line instead of failing with a bare IndexError.
            if len(fields) < 2:
                raise ValueError(
                    "{}: line {} is not in 'pop: taxon1 taxon2 ...' format: "
                    "{!r}".format(pop_file, line_no, stripped))
            pops[fields[0]] = fields[1].split()
    return pops
def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.

    The output file is written next to the input, with the ".vcf" (or
    ".vcf.gz") suffix replaced by ".tmix". Its first line lists the population
    names; every following line holds one locus as space-separated
    "<count of 0 alleles>,<count of 1 alleles>" pairs, one pair per population.
    Loci whose ALT column is longer than one character are skipped.

    Args:
        vcf_file: Path to the VCF file. Names ending in ".gz" are read
            through gzip.
        pop_obj: Mapping of population name -> list of taxon names, as
            returned by get_pops().

    Raises:
        ValueError: If a data line appears before the "#CHROM" header line,
            or if a taxon of pop_obj is not a column of that header.
    """
    import gzip

    # str.strip(".vcf") removes any of the characters '.', 'v', 'c', 'f' from
    # both ends (e.g. "cov.vcf" -> "o"); remove the real suffix instead.
    base_name = vcf_file
    for suffix in (".vcf.gz", ".vcf"):
        if base_name.endswith(suffix):
            base_name = base_name[:-len(suffix)]
            break
    output_name = base_name + ".tmix"

    opener = gzip.open if vcf_file.endswith(".gz") else open
    taxa_pos = None
    # Context managers close both files even if parsing fails midway.
    with opener(vcf_file, "rt") as vcf_fh, open(output_name, "w") as output_fh:
        # Write header for tmix file
        output_fh.write("{}\n".format(" ".join(pop_obj.keys())))
        for line in vcf_fh:
            # Skip header
            if line.startswith("##"):
                pass
            # Get taxon positions
            elif line.startswith("#CHROM"):
                taxa_pos = line.strip().split()
            # Ignore empty lines
            elif line.strip() != "":
                fields = line.strip().split()
                # Ignore loci with more than two alleles
                if len(fields[4]) > 1:
                    continue
                if taxa_pos is None:
                    raise ValueError(
                        "{}: data line found before the #CHROM header"
                        .format(vcf_file))
                # Get allele counts for each population
                temp_pop = OrderedDict((x, [0, 0]) for x in pop_obj.keys())
                for pop, taxa in pop_obj.items():
                    for taxon in taxa:
                        # Get taxon genotype. Only the first colon-separated
                        # subfield (GT) is counted, so digits in subfields
                        # such as depth do not inflate the allele counts.
                        gen = fields[taxa_pos.index(taxon)].split(":")[0]
                        # Skip if gen is missing data
                        if gen == "./.":
                            continue
                        temp_pop[pop][0] += gen.count("0")
                        temp_pop[pop][1] += gen.count("1")
                # Write current locus to file
                output_fh.write("{}\n".format(" ".join(
                    str(x[0]) + "," + str(x[1]) for x in temp_pop.values())))
def main():
    """Convert the VCF named on the command line using its population file.

    Both paths are taken from the module-level ``arg`` namespace.
    """
    vcf_path, pop_path = arg.vcf_file, arg.pop_file
    vcf2treemix(vcf_path, get_pops(pop_path))
main()
I have zero experience with python and I just run the script to manipulate genetic data.
Any help will be highly appreciable.
Thanks
Ali
I tried both Python 2 and Python 3, and I expected the script to work out of the box. I do not think there is a problem with the input data.
I have to use a text file and extract the most frequent ip address and count how many times they come up
# Attempt at counting the most frequent IP addresses of a log file.
# NOTE(review): indentation was lost in this listing, so it cannot be
# determined which statements belong to which function; the comments below
# only describe what each individual line does.
def anaylse_log(parameter):
myfile = open("sample_log_1 test.txt", "r")
iPdata = myfile.readlines()
mydict = {}
ipAddress = []
item_list = []
result_file = []
counter = ()
def extract_log(myfile):
#split the file line by line
for line in myfile:
splitData = line.split()
ipAddress = splitData[0]
numbers = splitData[1]
# NOTE(review): ipAddress and numbers were just rebound to strings taken from
# splitData, and str has no append(); both calls below raise AttributeError.
ipAddress.append(ipAddress)
numbers.append(numbers)
if numbers in mydict:
#if numbers is already a key in the dictionary
#increase the count
mydict[numbers] += 1
else:
# Otherwise if it's not yet in the dictionary
# Initialise it to 1
mydict[numbers] = 1
return numbers
myfile.close()
def find_most_frequent(maximum,iPdata):
# NOTE(review): this opens "sample_log_1 text", whereas the file opened at the
# top is "sample_log_1 test.txt".
with open("sample_log_1 text", "r") as myfile:
for text in myfile:
if str(maximum) in text:
return maximum
with open("resultss.csv", "w") as file:
# NOTE(review): maximum is called here as if it were a function.
file.write(maximum(maximum))
#This will put the dictionary into tuples and give each key a value
item_list = [(k, v) for k, v in mydict.items()]
#This will sort the list by v
item_list.sort(key=lambda x:x[1], reverse=True)
# NOTE(review): mydict is assigned a dict above; calling a dict raises TypeError.
maximum = mydict()
def main(myfile,mydict,iPdata):
result_file = open("resultss.csv", "w")
# NOTE(review): main is declared with three parameters but called with none.
main()
I had to fix the spacing so that the code could be posted; I hope that is OK and that you are able to run it. I have been stuck on this for a while, and I thought I was calling the functions, too.
Suppose your log file is like
15.25.7.3
25.25.2.5
25.25.2.5
115.25.7.3
215.25.7.3
25.25.2.5
Here is a simple way to count ips
# Tally how many times each IP address occurs in the log, one address per line.
ip_count_dict = {}
with open('ip.log', 'r') as f:
    contents = f.read()

# For a comma-separated file, use contents.split(',') here instead of
# contents.splitlines().
for raw_entry in contents.splitlines():
    address = raw_entry.strip()
    # Unseen addresses start from 0, seen ones are incremented.
    ip_count_dict[address] = ip_count_dict.get(address, 0) + 1

print(ip_count_dict)
Output: {'15.25.7.3': 1, '25.25.2.5': 3, '115.25.7.3': 1, '215.25.7.3': 1}
Instead of manually counting IPs as you loop through your log, try this:
from collections import Counter
# Read the file inside a with-block so the handle is closed promptly.
with open("resultss.csv") as log_file:
    # splitlines() does not produce a trailing '' entry for the final newline.
    log_entries = log_file.read().splitlines()
# The IP is the first comma-separated field; blank lines are skipped so that
# they are not counted as an empty-string "IP".
ip_list = [log.split(",")[0] for log in log_entries if log.strip()]
counts = Counter(ip_list)
print(counts)
This works with a CSV file format like:
10.10.10.1,asdf,31
5.9.7.11,aajbczxz,54
5.9.7.11,zzzzz,2
have a Dict with multiple values in a tuple.
newhost = {'newhost.com': ('1.oldhost.com',
'2.oldhost.com',
'3.oldhost.com',
'4.oldhost.com')
}
I wanna open a existing file and search for lines in this file that contains a value of the oldhosts. A file can have multiple Account Lines. In example
Account: 1.oldhost.com username
Account: someotherhost username
When a line containing 1.oldhost.com, 2.oldhost.com, 3.oldhost.com and so on is found, I want to replace that host with the key from the dict, newhost.com.
Can anyone help me? Searched alot, but didnt find the right thing.
Regards
Maybe something like this could get you started
infile_name = 'some_file.txt'

# Open and read the incoming file
with open(infile_name, 'r') as infile:
    text = infile.read()

# Cycle through the dictionary; host_dict maps each new host name to an
# iterable of the old host names it replaces.
for newhost, oldhost_list in host_dict.items():
    # Cycle through each possible old host
    for oldhost in oldhost_list:
        # Strings are immutable: replace() returns a new string, so the
        # result has to be assigned back or the replacement is lost.
        text = text.replace(oldhost, newhost)

outfile_name = 'some_other_file.txt'

# Write to file
with open(outfile_name, 'w') as outfile:
    outfile.write(text)
Not claiming this to be the best solution, but it should be a good start for you.
To easily find the new host for a given old host, you should convert your data structure:
# your current structure
# Existing structure: each new host maps to a tuple of the old hosts it replaces.
new_hosts = {
    'newhost-E.com': ('1.oldhost-E.com', '2.oldhost-E.com'),
    'newhost-A.com': ('1.oldhost-A.com', '2.oldhost-A.com', '3.oldhost-A.com'),
}

# Proposed structure: one entry per old host, pointing at its new host.
new_hosts_2 = {}
for new_name, old_names in new_hosts.items():
    for old_name in old_names:
        new_hosts_2[old_name] = new_name

print(new_hosts_2)
# {
# '1.oldhost-E.com': 'newhost-E.com',
# '2.oldhost-E.com': 'newhost-E.com',
# '1.oldhost-A.com': 'newhost-A.com',
# '2.oldhost-A.com': 'newhost-A.com',
# '3.oldhost-A.com': 'newhost-A.com',
# }
This does repeat the new host names (the values in new_hosts_2), but it will allow you to quickly look up given an old host name:
# 'x.oldhost.com' is a placeholder: use a name that is a key of new_hosts_2,
# otherwise the lookup raises KeyError.
some_old_host = 'x.oldhost.com'
corresponding_new_host = new_hosts_2[some_old_host]
Now you just need to:
read the lines of the file
find the old hostname in that line
lookup the corresponding new host in new_hosts_2
replace that value in the line
write the line to a new file
Maybe like this:
# Copy file_name_1 to file_name_2, replacing the first known old host found
# on each line with its new host from new_hosts_2.
with open(file_name_1, 'r') as fr, open(file_name_2, 'w') as fw:
    for line in fr:
        line = line.strip()
        if line:
            for old_host, new_host in new_hosts_2.items():
                # Locate the start and end position of the old host.
                # NOTE: this is a substring search, so '1.oldhost.com' is
                # also found inside '11.oldhost.com'.
                start_i = line.find(old_host)
                if start_i != -1:
                    end_i = start_i + len(old_host)
                    line = line[:start_i] + new_host + line[end_i:]
                    break
        fw.write(line + '\n')
Thank you for your tips. I came up with this now and it is working fine.
import fileinput
textfile = 'somefile.txt'
curhost = 'newhost.com'
hostlist = {
    curhost: (
        '1.oldhost.com',
        '2.oldhost.com',
        '3.oldhost.com',
    ),
}

# Invert the mapping so that every old host name points at its replacement.
new_hosts_2 = {}
for replacement, old_names in hostlist.items():
    for old_name in old_names:
        new_hosts_2[old_name] = replacement

# With inplace=True, everything printed inside the loop is written back into
# the file; lines that are empty after rstrip() are not printed.
for line in fileinput.input(textfile, inplace=True):
    line = line.rstrip()
    if line:
        for f_key, f_value in new_hosts_2.items():
            # replace() returns the line unchanged when f_key does not occur.
            line = line.replace(f_key, f_value)
        print(line)
I have a list, abbreviations, filled with string objects. I am trying to call the .index of a string in my list. When I call the .index method with a string I get a ValueError: 'LING' is not in list, when it clearly is in the list.
My code:
for item in abbreviations:
print item
print abbreviations.index("LING")
Why does 'LING' not exist when it clearly does? I have added the following lines of code, which search 'abbreviations' for the index of a string. I am baffled -- "LING" is clearly in my abbreviations list.
EDIT (Additional Code):
import csv
# Python 2 script. Reads (long name, abbreviation) rows from classAbrevs.csv,
# extracts subject abbreviations from the '|'-separated masterClassList.txt,
# and prints the long name for each abbreviation.
# NOTE(review): indentation was lost in this listing, so loop nesting cannot
# be determined from the text alone.
myfile = open("/Users/it/Desktop/Classbook/classAbrevs.csv", "rU")
lines = [tuple(row) for row in csv.reader(myfile)]
longSubjectNames = []
abbreviations = []
masterAbrevs = []
for item in lines:
longSubjectNames.append(item[0])
# NOTE(review): item[1] is stored without stripping whitespace; the joined
# output shown after this listing prints "LING" followed by whitespace, so an
# exact lookup of "LING" does not match that entry.
abbreviations.append(item[1])
# NOTE(review): `as myfile` rebinds the name used for the CSV handle above; the
# line after it opens the same path a second time without closing it.
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as myfile:
masterSchedule = tuple(open("/Users/it/Desktop/Classbook/masterClassList.txt", 'r'))
for masterline in masterSchedule:
# NOTE(review): the result of strip() is discarded; masterline is unchanged.
masterline.strip()
masterSplitLine = masterline.split("|")
subjectAbrev = ""
# Rows whose first field is "STATUS" are skipped.
if masterSplitLine[0] != "STATUS":
# Drop the digits from the third field, keeping only the subject letters.
subjectAbrev = ''.join([i for i in masterSplitLine[2] if not i.isdigit()])
masterAbrevs.append(subjectAbrev)
finalAbrevs = []
for subject in masterAbrevs:
# Remove a trailing 'W' unless the abbreviation ends in 'UW'.
if (subject[-1] == 'W') and (subject[-2:] != 'UW'):
subject = subject[:-1]
finalAbrevs.append(subject)
x = 0
for item in abbreviations:
print item
# list.index raises ValueError when no element equals "LING" exactly.
print abbreviations.index("LING")
for item in finalAbrevs:
# NOTE(review): masterSplitLine is presumably the value left over from the
# loop over masterSchedule above - confirm against the original indentation.
if masterSplitLine[0] != "STATUS":
concat = abbreviations.index(str(finalAbrevs[x]).strip())
print "The abbreviation for " + str(item) + " is: " + longSubjectNames[concat]
x = x + 1
The output of:
masterAbrevs = []
for item in lines:
longSubjectNames.append(item[0])
abbreviations.append(item[1])
print '-'.join(abbreviations)
is:
ACA-ACCY-AFST-AMST-ANAT-ANTH-APSC-ARAB-AH-FA-ASTR-BIOC-BISC-BME-BMSC-BIOS-BADM-CHEM-CHIN-CE-CLAS-CCAS-COMM-CSCI-CFA-CNSL-CPED-DNSC-EALL-ECON-EDUC-ECE-EHS-ENGL-EAP-EMSE-ENRP-EPID-EXSC-FILM-FINA-FORS-FREN-GEOG-GEOL-GER-GREK-HCS-HSCI-HLWL-HSML-HEBR-HIST-HOMP-HONR-HDEV-HOL-HSSJ-ISTM-IDIS-IAD-INTD-IAFF-IBUS-ITAL-JAPN-JSTD-KOR-LATN-LAW-LSPA-LING -MGT-MKTG-MBAD-MATH-MAE-MED-MICR-MMED-MSTD-MUS-NSC-ORSC-PSTD-PERS-PHAR-PHIL-PT-PA-PHYS-PMGT-PPSY-PSC-PORT-PSMB-PSYD-PSYC-PUBH-PPPA-REL-SEAS-SMPA-SLAV-SOC-SPAN-SPED-SPHR-STAT-SMPP-SUST-TRDA-TSTD-TURK-UW-WLP-WSTU
Traceback (most recent call last):
File "/Users/it/Desktop/Classbook/sortClasses.py", line 25, in <module>
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as anything:
IOError: [Errno 2] No such file or directory: '/Users/it/Desktop/Classbook/masterClassList.txt'
myfile = open("/Users/it/Desktop/Classbook/classAbrevs.csv", "rU")
lines = [tuple(row) for row in csv.reader(myfile)]
longSubjectNames = []
abbreviations = []
masterAbrevs = []
for item in lines:
longSubjectNames.append(item[0])
abbreviations.append(item[1])
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as myfile:
The problem is here;
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as myfile:
You defined myfile before here,
myfile = open("/Users/it/Desktop/Classbook/classAbrevs.csv", "rU")
So actually abbreviations = [] is not taking data from classAbrevs.csv.Because it's taking data from masterClassList.txt as you defined myfile with this line;
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as myfile
That's why your string not in that list.Also this line;
for item in lines:
longSubjectNames.append(item[0])
abbreviations.append(item[1])
Are you sure that item[1] has all of the strings that you want?
And I tried these codes I just copy-pasted it from your's and here is the result;
The problem is, from the result you ran:
"LING\t" is shown in your list, not "LING"
with running this I get the desired index:
abbreviations.index("LING\t")
71
To correct this, there are many methods to strip the \t, I'm showing one of those:
abbreviations.append(item[1].strip())
By correcting this line, your item[1] will strip the \t before appending to your abbreviations list.
For example, suppose I have a text / log file with a very simple structure: it consists of a few parts, each with a different format, separated by a marker line, e.g.:
0x23499 0x234234 0x234234
...
0x34534 0x353454 0x345464
$$$NEW_SECTION$$$
4345-34534-345-345345-3453
3453-34534-346-766788-3534
...
So, how can I read the file by these parts? E.g. read everything before the $$$NEW_SECTION$$$ mark into one variable and everything after it into another (without using regexps, etc.). Is there a simple solution for that?
Here is the solution without reading the whole file into memory:
# Pass 1: collect lines up to (not including) the first line starting with
# '$$$', then remember the file position just after that marker line.
data1 = []
with open('data.txt', 'r') as f:
    while True:
        line = f.readline()
        if not line or line.startswith('$$$'):
            break
        data1.append(line)
    pos = f.tell()

# Pass 2: reopen the file, jump to the remembered position and take the rest.
with open('data.txt', 'r') as f:
    f.seek(pos)
    data2 = list(f)

print(data1)
print(data2)
The first loop cannot be written as `for line in f`, because iterating over the file reads ahead and f.tell() would then no longer report the accurate position in the file.
The simplest solution is str.split
>>> s = filecontents.split("$$$NEW_SECTION$$$")
>>> s[0]
'0x23499 0x234234 0x234234\n\n0x34534 0x353454 0x345464\n'
>>> s[1]
'\n4345-34534-345-345345-3453\n3453-34534-346-766788-3534'
Solution 1:
If file is not very-big then:
with open('your_log.txt') as f:
parts = f.read().split('$$$NEW_SECTION$$$')
if len(parts) > 0:
part1 = parts[0]
...
Solution 2:
def FileParser(filepath, sentinel='$$$NEW_SECTION$$$'):
    """Yield the text of each section of a file, split at marker lines.

    A marker line is a line that equals *sentinel* once trailing whitespace
    is removed; the marker itself is not part of any section. The file is
    read line by line, so only one section is held in memory at a time.

    Args:
        filepath: Path of the file to read.
        sentinel: Text of the line that separates two sections.

    Yields:
        One string per section, holding that section's lines including their
        newlines. The text after the last marker is always yielded, even
        when it is empty.
    """
    with open(filepath) as f:
        part = []
        for line in f:
            # Lines read from a file keep their newline, so compare the
            # stripped text against the marker.
            if line.rstrip() == sentinel:
                yield ''.join(part)
                part = []
            else:
                part.append(line)
        # The final section has no closing marker.
        yield ''.join(part)
for segment in FileParser('your_log.txt'):
print segment
Note: it is untested code so please validate before using it
Solution:
def sec(file_, sentinel='$$$NEW_SECTION$$$'):
    """Yield the sections of a file as lists of lines, split at marker lines.

    A marker line is a line that equals *sentinel* once trailing whitespace
    is removed; it is not included in any section. *sentinel* defaults to
    the marker used in the question, so ``sec('file.txt')`` works as is.

    Args:
        file_: Path of the file to read.
        sentinel: Text of the line that separates two sections.

    Yields:
        One list per section containing its lines (newlines kept). The
        lines after the last marker are always yielded, even if there are
        none.
    """
    with open(file_) as f:
        section = []
        for line in f:
            if line.rstrip() == sentinel:
                yield section
                section = []
            else:
                section.append(line)
        # The final section has no closing marker.
        yield section
and use:
>>> from pprint import pprint
>>> pprint(list(sec('file.txt')))
[['0x23499 0x234234 0x234234\n', '0x34534 0x353454 0x345464\n'],
['4345-34534-345-345345-3453\n',
'3453-34534-346-766788-3534\n',
'3453-34534-346-746788-3534\n']]
>>>
To store the sections in variables or, better, in a dict:
>>> sections = {}
>>> for n, section in enumerate(sec('file.txt')):
... sections[n] = section
>>>