I am trying to process a file in parallel by reading it in chunks and processing each chunk with the multiprocessing library. Here is my code:
from multiprocessing import Pool
from itertools import islice
import traceback

# Produce key value pairs (Date, Market_Share*Market_Share)
def Map(L):
    results = []
    for w in L:
        temp = w.split(',')
        Date = temp[0]
        Share = float(temp[1][:-1])
        ShareSquare = str(Share*Share)
        results.append((Date, ShareSquare))
    return results

if __name__ == '__main__':
    pool = Pool(2)
    f = open('C:/Users/Daniel/Desktop/Project/Optiver/atchm_9450.csv', 'r')
    fw = open('C:/Users/Daniel/Desktop/Project/Optiver/marketshare.csv', 'w')
    f.readline()
    while True:
        next_n_lines = list(islice(f, 16))
        if not next_n_lines:
            break
        else:
            l = pool.map(Map, next_n_lines)
    f.close()
    fw.close()
However, it produces an index out of range error:
Traceback (most recent call last):
  File "trial.py", line 29, in <module>
    l = pool.map(Map,next_n_lines)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 558, in get
    raise self._value
IndexError: list index out of range
The list object I passed into the Map function is something like ['6/26/2014,68.90\n', '6/27/2014,68.84\n', '6/30/2014,68.80\n'....]
It works correctly when there is no parallelism involved (pool is not invoked).
What possibly causes this behavior?
At first glance, only those two lines can raise this exception:
Date = temp[0]
Share = float(temp[1][:-1])
Check that w contains enough data.
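A minimal sketch of such a check, reusing the Map function from the question (what counts as "enough data" here is an assumption):

def Map(L):
    results = []
    for w in L:
        temp = w.split(',')
        # Guard: skip any entry that does not split into at least
        # a date and a share value, so temp[1] can never be out of range.
        if len(temp) < 2 or not temp[1].strip():
            continue
        Date = temp[0]
        Share = float(temp[1].strip())
        results.append((Date, str(Share * Share)))
    return results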
I'm trying to run the Python script below (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both Python 2 and 3:
######### error ###
Traceback (most recent call last):
  File "./vcf2treemix.py", line 99, in <module>
    main()
  File "./vcf2treemix.py", line 95, in main
    pop_obj = get_pops(pop_file)
  File "./vcf2treemix.py", line 34, in get_pops
    pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python

# vcf2treemix.py
# Converts a vcf file into TreeMix input

import argparse
from collections import OrderedDict

parser = argparse.ArgumentParser(description="Parsing statistical output of"
                                             " VCFtools")

parser.add_argument("-vcf", dest="vcf_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
                    required=True)
parser.add_argument("-pop", dest="pop_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
                    required=True)

arg = parser.parse_args()


def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each population should be in one line, starting
    with the pop name, a colon and the corresponding taxa separated by whitespace.
    E.g.:
    pop1: taxon1 taxon2 taxon3
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            fields = line.strip().split(":")
            pops[fields[0]] = fields[1].split()
    return pops


def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.
    """
    vcf_fh = open(vcf_file)
    output_name = vcf_file.strip(".vcf") + ".tmix"
    output_fh = open(output_name, "w")

    # Write header for tmix file
    output_fh.write("{}\n".format(" ".join([x for x in pop_obj.keys()])))

    for line in vcf_fh:
        # Skip header
        if line.startswith("##"):
            pass
        # Get taxon positions
        elif line.startswith("#CHROM"):
            taxa_pos = line.strip().split()
        # Ignore empty lines
        elif line.strip() != "":
            fields = line.strip().split()
            # Ignore loci with more than two alleles
            if len(fields[4]) > 1:
                continue
            # Get allele counts for each population
            temp_pop = OrderedDict((x, [0, 0]) for x in pop_obj.keys())
            for pop, taxa in pop_obj.items():
                for taxon in taxa:
                    # Get taxon genotype
                    gen = fields[taxa_pos.index(taxon)]
                    # Skip if gen is missing data
                    if gen == "./.":
                        continue
                    temp_pop[pop][0] += gen.count("0")
                    temp_pop[pop][1] += gen.count("1")
            # Write current locus to file
            output_fh.write("{}\n".format(" ".join([str(x[0]) + "," + str(x[1]) for x in temp_pop.values()])))

    vcf_fh.close()
    output_fh.close()


def main():
    # Args
    vcf_file = arg.vcf_file
    pop_file = arg.pop_file

    pop_obj = get_pops(pop_file)
    vcf2treemix(vcf_file, pop_obj)


main()
I have zero experience with Python; I just ran the script to manipulate genetic data.
Any help would be highly appreciated.
Thanks,
Ali
I tried Python 2 and 3, and I expected the script to work straightforwardly. I don't think there is a problem with the input data.
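For what it's worth, a sketch (not part of the original script) of how this error can arise: the traceback points at pops[fields[0]] = fields[1].split(), which only fails when a line in the pop file contains no colon, for example a trailing blank line. A defensive variant of get_pops that reports such lines instead of crashing could look like this:

def get_pops(pop_file):
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            fields = line.strip().split(":")
            if len(fields) < 2:
                # Report and skip lines that do not look like "pop_name: taxon1 taxon2 ..."
                print("Skipping malformed line: %r" % line)
                continue
            pops[fields[0]] = fields[1].split()
    return pops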
Error:
Traceback (most recent call last):
  File "son.py", line 120, in <module>
    main()
  File "son.py", line 101, in main
    temp += item
TypeError: 'ApplyResult' object is not iterable
Code:
pool = multiprocessing.Pool(processes=int(args.process))
for i in range(int(args.process)):
    result_first_round.append(pool.apply_async(Son_Algorithm, (i,)))
pool.close()
pool.join()

first_temp = []
for res in result_first_round:
    first_temp.append(res)
    #first_temp.append(res.get())

#Second Scan
result_final_round = []
temp = []
for item in first_temp:
    temp += item

temp2 = []
for result in temp:
    if not result in temp2:
        temp2.append(result)
temp_result = temp2
It seems that you want to add the element item to the list temp. In that case you need to use the method append(), like this:
temp = []
for item in first_temp:
    temp.append(item)
The operator += for a list only works if the second object is also a list (or at least an iterable).
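A small standalone illustration of that point (the values are made up, not from the question):

temp = []
temp += [1, 2]        # fine: the right-hand side is a list
temp += (3, 4)        # fine: a tuple is iterable too
temp.append(42)       # append() takes any single object
print(temp)           # [1, 2, 3, 4, 42]

temp += 42            # not iterable -> TypeError, like the ApplyResult above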
I am working on a program that parses through log files and returns the top hits for IP addresses and a couple of other things. Currently I am having trouble, and I cannot relate any of the existing answers to this problem to what I have going on right now. This is all of my code:
import gzip
from collections import Counter

logFileName = open('C:\\Users\\Pawlaczykm\\Desktop\\fileNames.txt', 'r')
ipAdd = []
landingPages = []
ALL_ipAdd = []
ALL_landingPages = []

# everything after this line gets done to all files
for line in logFileName.readlines():
    # rstrip removes a blank line from output
    # print 'Summary of: ' + line.rstrip()

    # use gzip to decompress the file
    with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
        # we extract the ip addresses in lines 15-18
        for eachLine in f:
            parts = eachLine.split('\t')
            if len(parts) > 1:
                ipAdd.append(parts[2])
    ALL_ipAdd.append(ipAdd)

    # use gzip to decompress the file
    with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
        # we extract the landing pages
        for eachLine in f:
            parts = eachLine.split('\t')
            if len(parts) > 1:
                variable = parts[8].split('?')[0]
                landingPages.append(variable)
    ALL_landingPages.append(landingPages)

ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
sortedALL_ipAdd = sorted(ALL_ipAddDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top IPs of all files'
print(sortedALL_ipAdd)

ALL_LandingPageDict = dict(Counter(ALL_landingPages).most_common())
sortedALL_LandingPage = sorted(ALL_LandingPageDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top landing pages of all files'
print(sortedALL_LandingPage)
Now where I am having trouble is in the following line:
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
The output when I run the whole program is this:
Traceback (most recent call last):
  File "C:/Users/Pawlaczykm/PycharmProjects/LogParse/parseText.py", line 35, in <module>
    ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
  File "C:\Python27\lib\collections.py", line 477, in __init__
    self.update(*args, **kwds)
  File "C:\Python27\lib\collections.py", line 567, in update
    self[elem] = self_get(elem, 0) + 1
TypeError: unhashable type: 'list'
Can somebody help me? This is frustrating.
From your code ALL_ipAdd = [], ipAdd = [] and ALL_ipAdd.append(ipAdd) we can conclude that ALL_ipAdd is a list of lists. Counter is a subtype of dict, which hashes its items before it counts them. Lists cannot be hashed because they are mutable (if the list changed, the hash would change), and thus lists can't be counted by Counter objects.
To solve this you can convert the inner lists to tuples before counting them:
ALL_ipAddDict = dict(Counter(map(tuple, ALL_ipAdd)).most_common())
That's normal. ALL_ipAdd is a list of lists, and Counter can only count hashable items such as strings or tuples :)
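A small standalone illustration (made-up data, not from the question):

from collections import Counter

rows = [['1.2.3.4', '5.6.7.8'], ['1.2.3.4', '5.6.7.8']]

# Counter(rows)                    # TypeError: unhashable type: 'list'
print(Counter(map(tuple, rows)))   # Counter({('1.2.3.4', '5.6.7.8'): 2})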
I want to serialize an ndarray/list filled with complex numbers; the example code is here:
>>> a = [None]
>>> a[0] = 0.006863076166054825+0j
>>> a
[(0.006863076166054825+0j)]
>>> b = json.dumps(a)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "D:\Python27\lib\json\__init__.py", line 243, in dumps
    return _default_encoder.encode(obj)
  File "D:\Python27\lib\json\encoder.py", line 207, in encode
    chunks = self.iterencode(o, _one_shot=True)
  File "D:\Python27\lib\json\encoder.py", line 270, in iterencode
    return _iterencode(o, 0)
  File "D:\Python27\lib\json\encoder.py", line 184, in default
    raise TypeError(repr(o) + " is not JSON serializable")
TypeError: (0.006863076166054825+0j) is not JSON serializable
So how do I deal with this problem?
json.dumps(a) will fail because the encoder cannot handle the complex number when it tries to serialize it. A simple way to pass the value is as a string:
a = [1]
a[0] = "0.006863076166054825+0j"
b = json.dumps(a)
print b
which outputs
["0.006863076166054825+0j"]
OK, let me make it clear. I found another way to do it: use the pickle module. E.g.:
fp = open("1.txt","w")
a = [1,2,3]
pickle.dump(a,fp,0)
fp.close()
Loading works the same way:
fp = open("1.txt")
a = pickle.load(fp)
print a
fp.close()
It can serialize almost any object, as long as the class can be found when loading.
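For the complex-valued list from the question, a round trip might look like this (a sketch; the file is opened in binary mode here, which works on both Python 2 and 3):

import pickle

a = [0.006863076166054825 + 0j]

with open("1.txt", "wb") as fp:
    pickle.dump(a, fp, 0)

with open("1.txt", "rb") as fp:
    print(pickle.load(fp))    # [(0.006863076166054825+0j)]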
I needed a solution to this problem too. I wrote the code below; it works for my task, although it did not pass all the checks I ran it through. Still, one can use it.
# turn complex to str
def replace_complex(inp):
    """Replace complex numbers with strings of
    complex number + __ in the beginning.

    Parameters:
    ------------
    inp: input dictionary.
    """
    try:
        if isinstance(inp, complex):
            return "__" + str(inp)
        elif isinstance(inp, list):
            for each in range(len(inp)):
                inp[each] = replace_complex(inp[each])
            return inp
        elif isinstance(inp, dict):
            for key, val in inp.items():
                inp[key] = replace_complex(val)
            return inp
        else:
            return inp  # nothing found - better than no checks
    except Exception as e:
        print(e)
        return ""
I am trying to read in a large JSON file (data.json) in Python. Because the JSON file contains multiple JSON objects, and multiple dictionaries will be created in Python (the number of dictionaries is unknown), I used decoder.raw_decode() and a generator.
The following is the code:
import json
import pprint
import io
import pprint


def parse():
    with open('data.json', encoding='utf-8') as jfile:
        try:
            while True:
                decoder = json.JSONDecoder()
                obj, idx = decoder.raw_decode(jfile)
                yield obj
        except ValueError as e:
            print(e)
            pass
        else:
            print("aha")


def main():
    imputd = parse()
    if imputd:
        while True:
            try:
                print(str(next(imputd)).readlines())
            except StopIteration as e:
                print(e)
                break

main()
I get the error:
Traceback (most recent call last):
  File "H:\Document\Python\j10.py", line 57, in <module>
    main()
  File "H:\Document\Python\j10.py", line 36, in main
    print(str(next(imputd)).readlines())
  File "H:\Document\Python\j10.py", line 21, in parse
    obj, idx = decoder.raw_decode(jfile)
  File "C:\Python34\lib\json\decoder.py", line 360, in raw_decode
    obj, end = self.scan_once(s, idx)
TypeError: first argument must be a string, not _io.TextIOWrapper
I edited the code based on Martijn's answer:
import json
import io

file = open('data.json.txt')


def readin():
    return file.read(2000)


def parse():
    decoder = json.JSONDecoder()
    buffer = ''
    for chunk in iter(readin, ''):
        buffer += chunk
        while buffer:
            try:
                result, index = decoder.raw_decode(buffer)
                yield result
                buffer = buffer[index:]
            except ValueError:
                # Not enough data to decode, read more
                break


def main():
    imputd = parse()
    if imputd:
        while True:
            try:
                print(str(next(imputd)).readlines())
            except StopIteration as e:
                print(e)
                break

main()
and I get a UnicodeDecodeError:
Traceback (most recent call last):
  File "H:\Document\Python\j11.py", line 35, in <module>
    main()
  File "H:\Document\Python\j11.py", line 30, in main
    print(str(next(imputd)).readlines())
  File "H:\Document\Python\j11.py", line 14, in parse
    for chunk in iter(readin, ''):
  File "H:\Document\Python\j11.py", line 8, in readin
    return file.read(2000)
  File "C:\Python34\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4217: character maps to <undefined>
You are passing in the file object, but decoder.raw_decode() only takes text data. You need to do the reading yourself:
obj, idx = decoder.raw_decode(jfile.read())
You are then yielding Python objects created from the JSON data, so your .readlines() call in your main() function loop will also fail.
You are not using raw_decode() correctly, however. You are yourself responsible for feeding it chunks of text, it'll not read that text from the file for you. If you wanted to handle the file in chunks, and there are no clear delimiters between the JSON entries, you'll be forced to read the file in blocks:
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(partial(jfile.read, buffersize), ''):
    buffer += chunk
    while buffer:
        try:
            result, index = decoder.raw_decode(buffer)
            yield result
            buffer = buffer[index:]
        except ValueError:
            # Not enough data to decode, read more
            break
This will still yield completely decoded objects; if your file is one long JSON object (like one top-level list or dictionary) then this'll not yield the contents of that object one by one; it'll still read the whole object before yielding.
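For completeness, a minimal self-contained version of that approach (the file name, buffer size, encoding= argument and the lstrip() between objects are assumptions, not from the answer):

import json
from functools import partial


def parse(path, buffersize=2048):
    # Yield one decoded JSON object at a time from a file that
    # contains several JSON documents back to back.
    decoder = json.JSONDecoder()
    buffer = ''
    with open(path, encoding='utf-8') as jfile:
        for chunk in iter(partial(jfile.read, buffersize), ''):
            buffer += chunk
            while buffer:
                try:
                    result, index = decoder.raw_decode(buffer)
                    yield result
                    # Drop the decoded part and any whitespace between objects
                    buffer = buffer[index:].lstrip()
                except ValueError:
                    # Not enough data to decode, read more
                    break


# The generator already yields Python dicts/lists, so it is consumed
# with a plain for loop rather than .readlines().
for obj in parse('data.json'):
    print(obj)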