After reading from a file I have a list of lists containing not only digits but also other characters, which I would like to get rid of.
I've tried using re.sub function but this doesn't seem to work
import re
# Sample data: each inner list starts with an id, followed by tokens whose
# first and last entries still carry the '[' / ']' read from the file.
Poly_id= [['0', '[4', '8', '18', '20', '5', '0', '4]'], ['1', '[13', '16',
'6', '11', '13]'], ['2', '[3', '1', '10', '9', '2', '15', '3]'], ['3',
'[13', '12', '16', '13]'], ['4', '[13', '11', '17', '14', '7', '13]']]
for x in Poly_id:
# NOTE: re.sub returns new strings and this list comprehension builds a new
# list, but the result is never assigned to anything, so neither x nor
# Poly_id is modified by this statement.
[re.sub(r'\W', '', ch) for ch in x]
This doesn't seem to change a thing in this list.
I would like to have a list with only numbers as elements so that I could convert them into integers
I guess technically [4 is non numeric so you can do something like this:
# Keep only the entries that consist entirely of decimal digits, so that
# every surviving string can be handed to int(). str.isdecimal is used rather
# than str.isnumeric because isnumeric also accepts characters such as
# '\u00bd' (one half) or '\u00b2' (superscript two), which int() rejects.
Poly_id = [[item for item in row if item.isdecimal()] for row in Poly_id]
Output:
['0', '8', '18', '20', '5', '0']
['1', '16', '6', '11']
['2', '1', '10', '9', '2', '15']
['3', '12', '16']
['4', '11', '17', '14', '7']
If you just want to remove the non numeric values and not the complete entry then you can do this:
# Strip every non-digit character from each entry but keep the entry itself,
# e.g. '[4' -> '4' and '13]' -> '13'. An entry without any digit becomes ''.
# str.isdecimal (not isnumeric) guarantees the result is accepted by int().
Poly_id = [
    [''.join(ch for ch in item if ch.isdecimal()) for item in row]
    for row in Poly_id
]
Output:
['0', '4', '8', '18', '20', '5', '0', '4']
['1', '13', '16', '6', '11', '13']
['2', '3', '1', '10', '9', '2', '15', '3']
['3', '13', '12', '16', '13']
['4', '13', '11', '17', '14', '7', '13']
Here a solution if you want to get rid of the '[' in '[4' but keep the '4':
# Remove every non-word character (anything matched by \W) from each string,
# leaving Poly_id itself untouched: '[4' becomes '4' and '13]' becomes '13'.
res = []
for inlist in Poly_id:
    cleaned = []
    for st in inlist:
        cleaned.append(re.sub(r'\W', '', st))
    res.append(cleaned)
res is:
[
['0', '4', '8', '18', '20', '5', '0', '4'],
['1', '13', '16', '6', '11', '13'],
['2', '3', '1', '10', '9', '2', '15', '3'],
['3', '13', '12', '16', '13'],
['4', '13', '11', '17', '14', '7', '13']
]
You can use a module, "itertools"
import itertools
list_of_lists = [[1, 2], [3, 4]]
# chain.from_iterable walks the outer list itself and yields the items of
# each inner list in order, flattening exactly one level of nesting.
flattened = list(itertools.chain.from_iterable(list_of_lists))
print(flattened)
>>>[1, 2, 3, 4]
I'm trying to do some webscraping to download all the results of euromillions, stuck with errors now. I'm using jupyter and python 3 with the modules specified. With just one link the code worked just fine but now I added a loop and some modifications and rip xD
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# Intended to fetch the EuroMillions results archive page for the years
# 2004-2017 and print one list per draw: [date, ball1..ball5, star1, star2].
# NOTE(review): the original indentation was lost, so the exact nesting of the
# loops below cannot be confirmed from this listing.
years = list(range(2004,2018))
for year in years:
my_urls = ('https://www.euro-millions.com/pt/arquivo-de-resultados-' + str(year),)
my_url = my_urls[0]
for my_url in my_urls:
Client = uReq(my_url)
# NOTE: `read` is referenced but not called (no parentheses), so `html` is
# bound to the method object rather than to the page content. This is the
# object the traceback below complains about ("object of type 'method' has
# no len()").
html = Client.read
Client.close()
euro = soup(html, "html")
containers = euro.findAll("div",{"class":"archives"})
print(containers)
# Indexing [0] raises IndexError when no matching <div> was found.
container = containers[0]
for container in containers:
data = container.a["href"].replace('/pt/resultados/','') # We use [] as with a dictionary; .strip also removes the junk sometimes
bolasN = container.ul.findAll("li",{"class":"ball"})
bolasS = container.ul.findAll("li",{"class":"lucky-star"})
bola1 = bolasN[0].text
bola2 = bolasN[1].text
bola3 = bolasN[2].text
bola4 = bolasN[3].text
bola5 = bolasN[4].text
star1 = bolasS[0].text
star2 = bolasS[1].text
# One output row: the date taken from the link, five balls and two stars.
TUDO = [data, bola1, bola2, bola3, bola4, bola5, star1, star2]
print(TUDO)
TRACEBACK:
TypeError Traceback (most recent call last)
<ipython-input-31-b11e2044b5ea> in <module>
12 html = Client.read
13 Client.close()
---> 14 euro = soup(html, "html")
15 containers = euro.findAll("div",{"class":"archives"})
16 print(containers)
/usr/local/lib/python3.5/dist-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
244 if hasattr(markup, 'read'): # It's a file-type object.
245 markup = markup.read()
--> 246 elif len(markup) <= 256 and (
247 (isinstance(markup, bytes) and not b'<' in markup)
248 or (isinstance(markup, str) and not '<' in markup)
TypeError: object of type 'method' has no len()
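The parser name passed to BeautifulSoup should be 'html.parser' or 'lxml'. Note also that the TypeError in the traceback comes from `html = Client.read`: without the parentheses, `html` is the method itself rather than the page content, which is why `len(markup)` fails. Fetching the page with requests avoids this: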
import requests
from bs4 import BeautifulSoup as soup
ARCHIVE_URL = 'https://www.euro-millions.com/pt/arquivo-de-resultados-'

years = list(range(2004, 2018))


def archive_url(year):
    """Return the results-archive page URL for the given year."""
    return ARCHIVE_URL + str(year)


def parse_draws(page):
    """Yield one row per draw found in a parsed archive page.

    Args:
        page: a BeautifulSoup document for one archive page.

    Yields:
        list of str: [date, ball1..ball5, star1, star2]. The date is the
        draw link with the '/pt/resultados/' prefix removed. A draw with
        fewer balls or stars than expected yields a shorter row instead of
        raising IndexError.
    """
    for container in page.find_all("div", {"class": "archives"}):
        # Tag attributes are read with [] as with a dictionary.
        data = container.a["href"].replace('/pt/resultados/', '')
        bolas = [li.text for li in container.ul.find_all("li", {"class": "ball"})]
        stars = [li.text for li in container.ul.find_all("li", {"class": "lucky-star"})]
        yield [data] + bolas[:5] + stars[:2]


if __name__ == "__main__":
    # Every year is fetched and parsed inside the same loop; previously the
    # year loop only rebuilt the URL, so just the last year was scraped.
    # Pages are processed oldest year first.
    for year in years:
        Client = requests.get(archive_url(year))
        # Fail with a clear HTTP error instead of parsing an error page.
        Client.raise_for_status()
        euro = soup(Client.content, "html.parser")
        for TUDO in parse_draws(euro):
            print(TUDO)
Output (draws for 2017):
['29-12-2017', '4', '8', '22', '23', '48', '1', '12']
['26-12-2017', '4', '17', '30', '43', '44', '2', '10']
['22-12-2017', '5', '24', '30', '31', '43', '3', '6']
['19-12-2017', '8', '15', '30', '38', '46', '4', '7']
['15-12-2017', '25', '30', '31', '42', '50', '2', '11']
['12-12-2017', '20', '37', '39', '44', '50', '4', '8']
['08-12-2017', '4', '22', '30', '32', '34', '3', '4']
['05-12-2017', '11', '36', '43', '44', '48', '2', '7']
['01-12-2017', '5', '24', '29', '35', '46', '11', '12']
['28-11-2017', '1', '6', '12', '18', '42', '2', '7']
['24-11-2017', '19', '24', '28', '30', '50', '3', '10']
['21-11-2017', '2', '10', '14', '28', '31', '5', '7']
['17-11-2017', '20', '26', '35', '36', '42', '5', '12']
['14-11-2017', '14', '16', '39', '40', '41', '8', '10']
['10-11-2017', '13', '22', '29', '36', '37', '1', '9']
['07-11-2017', '7', '19', '20', '37', '41', '2', '12']
['03-11-2017', '5', '12', '17', '33', '41', '4', '9']
['31-10-2017', '1', '12', '36', '43', '46', '3', '5']
['27-10-2017', '3', '16', '23', '32', '39', '1', '4']
['24-10-2017', '9', '11', '13', '27', '33', '7', '10']
['20-10-2017', '4', '17', '23', '27', '30', '3', '8']
['17-10-2017', '13', '17', '19', '26', '36', '2', '3']
['13-10-2017', '23', '29', '37', '45', '50', '5', '11']
['10-10-2017', '4', '21', '34', '36', '37', '3', '6']
['06-10-2017', '1', '9', '15', '19', '25', '1', '7']
['03-10-2017', '6', '24', '32', '48', '50', '1', '5']
['29-09-2017', '7', '18', '19', '32', '48', '3', '7']
['26-09-2017', '1', '29', '40', '41', '48', '6', '12']
['22-09-2017', '6', '11', '31', '39', '42', '1', '3']
['19-09-2017', '1', '8', '21', '30', '45', '2', '3']
['15-09-2017', '13', '18', '37', '44', '49', '9', '12']
['12-09-2017', '10', '17', '27', '29', '35', '4', '11']
['08-09-2017', '9', '24', '42', '47', '49', '1', '5']
['05-09-2017', '6', '9', '18', '28', '29', '1', '9']
['01-09-2017', '3', '7', '8', '14', '49', '5', '8']
['29-08-2017', '4', '12', '15', '32', '38', '1', '5']
['25-08-2017', '1', '5', '7', '15', '47', '9', '12']
['22-08-2017', '3', '10', '12', '17', '27', '3', '5']
['18-08-2017', '2', '24', '39', '42', '45', '2', '8']
['15-08-2017', '10', '14', '30', '35', '46', '4', '10']
['11-08-2017', '18', '28', '39', '46', '48', '5', '12']
['08-08-2017', '15', '25', '26', '40', '41', '4', '5']
['04-08-2017', '29', '30', '36', '40', '41', '2', '9']
['01-08-2017', '14', '21', '24', '29', '30', '8', '10']
['28-07-2017', '5', '9', '29', '31', '41', '2', '4']
['25-07-2017', '12', '14', '43', '44', '48', '2', '11']
['21-07-2017', '1', '8', '9', '26', '49', '5', '9']
['18-07-2017', '1', '25', '27', '41', '45', '5', '7']
['14-07-2017', '11', '14', '20', '21', '47', '7', '10']
['11-07-2017', '14', '22', '26', '42', '50', '8', '10']
['07-07-2017', '11', '20', '35', '37', '45', '3', '6']
['04-07-2017', '10', '22', '25', '37', '49', '5', '8']
['30-06-2017', '17', '35', '39', '47', '50', '6', '8']
['27-06-2017', '9', '17', '21', '28', '45', '1', '3']
['23-06-2017', '3', '4', '21', '31', '38', '3', '7']
['20-06-2017', '11', '18', '26', '43', '44', '8', '10']
['16-06-2017', '15', '17', '38', '41', '42', '9', '12']
['13-06-2017', '3', '12', '22', '27', '49', '4', '11']
['09-06-2017', '9', '20', '27', '39', '43', '10', '11']
['06-06-2017', '20', '22', '25', '37', '40', '3', '7']
['02-06-2017', '8', '10', '24', '33', '42', '3', '9']
['30-05-2017', '7', '12', '27', '38', '48', '6', '9']
['26-05-2017', '5', '7', '26', '36', '39', '2', '10']
['23-05-2017', '8', '15', '25', '27', '42', '1', '4']
['19-05-2017', '9', '11', '12', '19', '30', '4', '9']
['16-05-2017', '8', '11', '15', '20', '30', '3', '8']
['12-05-2017', '2', '20', '28', '29', '44', '3', '9']
['09-05-2017', '8', '12', '16', '22', '26', '6', '7']
['05-05-2017', '3', '7', '30', '35', '43', '1', '3']
['02-05-2017', '6', '19', '23', '25', '27', '11', '12']
['28-04-2017', '14', '20', '25', '30', '39', '2', '8']
['25-04-2017', '9', '11', '19', '32', '43', '3', '9']
['21-04-2017', '2', '13', '16', '22', '49', '4', '5']
['18-04-2017', '17', '22', '31', '38', '45', '5', '12']
['14-04-2017', '4', '14', '20', '23', '33', '6', '10']
['11-04-2017', '5', '21', '22', '31', '49', '2', '8']
['07-04-2017', '2', '10', '19', '35', '50', '6', '7']
['04-04-2017', '1', '9', '24', '33', '34', '2', '6']
['31-03-2017', '17', '24', '26', '28', '45', '4', '12']
['28-03-2017', '9', '13', '31', '33', '46', '6', '10']
['24-03-2017', '2', '17', '21', '27', '34', '5', '9']
['21-03-2017', '1', '20', '23', '44', '47', '4', '11']
['17-03-2017', '6', '10', '19', '29', '36', '3', '9']
['14-03-2017', '3', '5', '21', '36', '44', '3', '6']
['10-03-2017', '31', '36', '38', '47', '49', '8', '11']
['07-03-2017', '6', '37', '41', '48', '50', '4', '5']
['03-03-2017', '2', '11', '29', '30', '47', '1', '12']
['28-02-2017', '10', '20', '31', '35', '42', '2', '12']
['24-02-2017', '2', '4', '13', '22', '43', '8', '9']
['21-02-2017', '13', '19', '41', '45', '49', '3', '4']
['17-02-2017', '19', '25', '33', '36', '48', '2', '9']
['14-02-2017', '2', '10', '24', '40', '44', '3', '10']
['10-02-2017', '7', '21', '26', '35', '43', '2', '9']
['07-02-2017', '4', '10', '31', '38', '44', '8', '10']
['03-02-2017', '3', '4', '15', '46', '50', '5', '9']
['31-01-2017', '3', '4', '17', '23', '44', '6', '9']
['27-01-2017', '17', '20', '28', '45', '48', '5', '9']
['24-01-2017', '1', '5', '7', '17', '23', '3', '8']
['20-01-2017', '10', '17', '27', '31', '49', '3', '5']
['17-01-2017', '4', '16', '25', '43', '47', '2', '10']
['13-01-2017', '3', '7', '16', '26', '50', '4', '7']
['10-01-2017', '2', '11', '29', '35', '44', '4', '9']
['06-01-2017', '10', '14', '18', '21', '49', '9', '11']
['03-01-2017', '19', '23', '27', '34', '49', '1', '11']
Try passing the HTML text directly:
# NOTE(review): assumes `html` is a response object exposing a `.text`
# attribute and that `BeautifulSoup` is imported under that name — TODO
# confirm. This assignment also rebinds the name `soup`.
soup = BeautifulSoup(html.text)
Basically the issue is as follows: I have a bunch of workers that have a function prescribed to each (the function is worker(alist) ) and am trying to process 35 workers at the same time. Each worker reads their line from the file (the modulo part) and should process the line using the "worker" function. I've pen-tested and found that the raw manipulation and deletion of the useless indices is working 100% as intended.
The args part of the "pool.apply_async" function isn't passing the list "raw" into it and starting the process. Raw is completely correct and functions normally, worker by itself functions normally, the pool.apply_async function is the only place that there seems to be an issue and I have no idea how to fix it. Any help please?
The relevant code is here:
# Question code (Python 2): reads test.csv, trims each selected row and hands
# it to worker() through a process pool.
# NOTE(review): the original indentation was lost, so the nesting described
# in the comments below is inferred from the statements themselves.
NUM_WORKERS=35
f=open("test.csv")
# No `processes` argument is given, so the pool size is the library default,
# not NUM_WORKERS.
pool=multiprocessing.Pool()
# Opens (and truncates) final.csv; the returned file object is not kept.
open("final.csv",'w')
# range(1, NUM_WORKERS) yields 1..34, i.e. 34 values rather than 35.
for workernumber in range(1, NUM_WORKERS):
# NOTE(review): f is a single file iterator; once one full pass of this loop
# has consumed it, later workernumber iterations presumably see no lines.
for i,line in enumerate(f):
if i==0:
print "Skipping first line" #dont do anything
# With workernumber == 1 this condition is true for every remaining line.
elif i%workernumber==0:
# First comma-separated field, minus its first and last character, split on
# whitespace.
raw = line.split(',')[0][1:-1].split()
uselessindices=[-2,-3,-4,-5,-6]
counter=0
# If counter is incremented once per iteration, ui+counter is -2 every time,
# so this removes the five items that precede the last one.
for ui in uselessindices:
del raw[ui+counter]
counter+=1
print raw
# The AsyncResult returned here is discarded. An exception raised inside
# worker is only re-raised when .get() is called on that result, so a failing
# worker goes unnoticed.
pool.apply_async(worker, args=(raw,))
pool.close()
pool.join()
I suggest you put the calculation of raw into a generator function, and then use Pool.imap_unordered() or Pool.map() to run worker() over all of the items in the generator.
Something like this untested code:
def get_raw(path="test.csv"):
    """Yield the cleaned token list for every data row of a CSV file.

    The first line is treated as a header and skipped. For each remaining
    line, the first comma-separated field has its first and last character
    removed (the surrounding quotes in the sample data) and is split on
    whitespace; the five tokens just before the last one are then dropped.

    Args:
        path: file to read. Defaults to "test.csv", the name that was
            previously hard-coded.

    Yields:
        list of str: the remaining tokens of one row.
    """
    # Default text mode is used: the old 'rU' flag is rejected by Python
    # 3.11+, and Python 3 text mode already handles all newline styles.
    with open(path) as f:
        for i, line in enumerate(f):
            if i == 0:
                # skip header
                continue
            raw = line.split(',')[0][1:-1].split()
            # Same effect as deleting raw[-2] five times, but a row with
            # fewer than six tokens is trimmed instead of raising IndexError.
            del raw[-6:-1]
            yield raw
if __name__ == "__main__":
    # The guard keeps child processes from re-running this section when they
    # import the main module (needed with the "spawn" start method).
    pool = multiprocessing.Pool(processes=NUM_WORKERS)
    try:
        # map() blocks until every row has been processed and re-raises an
        # exception raised inside worker, so failures are not silent.
        pool.map(worker, get_raw())
    finally:
        # Always release the worker processes, even if a row failed.
        pool.close()
        pool.join()
import multiprocessing
def worker(arg):
    """Print the row handed to this worker process.

    Args:
        arg: the list of string tokens for one CSV row.

    Returns:
        None.
    """
    # A single parenthesised argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('doing work "%s"' % arg)
NUM_WORKERS = 35


def write_test_csv(path='test.csv'):
    """Write the sample input file.

    The file starts with an empty line (which the reader skips as the
    header), followed by ten lines of ten quoted, comma-terminated fields.
    """
    with open(path, 'w') as test:
        for i in range(100):
            if i % 10 == 0:
                test.write('\n')
            test.write('"%s 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23",' % i)


def parse_row(line):
    """Return the tokens of the first field of `line`, minus five of them.

    The first comma-separated field loses its first and last character (the
    quotes) and is split on whitespace; the five tokens just before the last
    one are then removed.
    """
    raw = line.split(',')[0][1:-1].split()
    # Same effect as deleting raw[-2] five times in a row.
    del raw[-6:-1]
    return raw


if __name__ == '__main__':
    write_test_csv()
    # Create/truncate the output file without leaking the file handle.
    open("final.csv", 'w').close()
    pool = multiprocessing.Pool(processes=NUM_WORKERS)
    results = []
    raw = []
    with open("test.csv") as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            raw = parse_row(line)
            results.append(pool.apply_async(worker, args=(raw,)))
    pool.close()
    pool.join()
    # apply_async stores an exception raised inside worker on its result
    # object; calling get() re-raises it here instead of losing it.
    for result in results:
        result.get()
    print('last raw len: %s' % len(raw))
    print('last raw value: %s' % raw)
Output:
doing work "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['10', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['20', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['30', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['40', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['50', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['60', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['70', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['80', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
doing work "['90', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']"
last raw len: 19
last raw value: ['90', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '23']
So I found out that an error occurring inside the worker was simply not being reported: worker was calling another function, dosomething(a1, a2, ... a20), and was only giving it 19 inputs. It seems apply_async won't surface errors raised inside the worker unless you call .get() on the result it returns, which is quite annoying, but I now understand. Thanks for all the help!
Illinois: ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3']
Indiana: ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']
Those are in my dictionary as d.
How would I get the largest and smallest value for each key in the dictionary, along with the index where each value is?
For example:
In Illinois, 26 is the largest value which is index 5 and 3 is the smallest value which is index 15.
in Indiana: 13 is largest value which is index 7 and 2 is the smallest value which is index 14
The output:
Illinois: 26 in index 5 and 3 in index 15
Indiana: 13 in index 7 and 2 in index 14
How would I do this?
# Map the first column of every CSV row to the list of its remaining columns.
# If the same key appears in several rows, the last row wins.
d = {row[0]: row[1:] for row in csv_f}
You can print the max and min in a format like your string as follows:
(assuming you only want the first occurrence)
MY_D = {'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
        'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']}


def min_max_with_index(values):
    """Return ((min, index), (max, index)) for a list of numeric strings.

    Indices are 0-based and refer to the first occurrence of each value.

    Args:
        values: non-empty list of strings that int() can convert.

    Raises:
        ValueError: if the list is empty or an entry is not an integer.
    """
    # Convert every entry, without filtering, so that positions in the
    # converted list match positions in the original list.
    numbers = [int(n) for n in values]
    lowest, highest = min(numbers), max(numbers)
    return (lowest, numbers.index(lowest)), (highest, numbers.index(highest))


for k, v in MY_D.items():
    (_min, min_at), (_max, max_at) = min_max_with_index(v)
    print("%s: Min - %d in index %d, Max - %d in index %d" % (k, _min, min_at, _max, max_at))
Here is a solution returning a dict of the form {state: ((index, maxval), (index, minval))}, where each value is kept as the original string:
d = {
    'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
    'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2'],
}


def numeric_value(pair):
    """Sort key: the integer value of an (index, numeric string) pair."""
    return int(pair[1])


# For every state, store ((index, value), (index, value)) for its largest and
# smallest entry. Values stay strings; indices are 0-based and point at the
# first occurrence, because max()/min() return the first best item on ties.
maxmin = {
    state: (
        max(enumerate(numbers), key=numeric_value),
        min(enumerate(numbers), key=numeric_value),
    )
    for state, numbers in d.items()
}
print(maxmin)
Bit thrown together, but seems to do the job.
d = {"Illinois": ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
     "Indiana": ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']}


def pair_value(pair):
    """Sort key: the number stored in a (number, index) pair."""
    return pair[0]


def extremes(values):
    """Return ((max, index), (min, index)) for a list of numeric strings.

    Indices are 0-based. Comparing on the number alone makes both results
    point at the first occurrence; comparing whole (number, index) tuples
    would pick the last position for a repeated maximum.

    Raises:
        ValueError: if the list is empty or an entry is not an integer.
    """
    pairs = [(int(text), pos) for pos, text in enumerate(values)]
    return max(pairs, key=pair_value), min(pairs, key=pair_value)


if __name__ == "__main__":
    # Single parenthesised arguments print the same text on Python 2 and 3.
    print(d)
    for state in d:
        maxpair, minpair = extremes(d[state])
        print("%s: %d in index %d and %d in index %d"%(state,maxpair[0],maxpair[1],
                                                        minpair[0],minpair[1]))
Output:
{'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2'], 'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3']}
Indiana: 13 in index 6 and 2 in index 13
Illinois: 26 in index 4 and 3 in index 14
To get around the blank string, you could break up the list comprehension into:
# Build (number, index) pairs for one state's values.
pairs = []
for x, text in enumerate(d[state]):
    try:
        pairs.append((int(text), x))
    except ValueError:
        # Blank or otherwise non-numeric entries are skipped on purpose.
        # Their position is still counted by enumerate, so the indices of
        # the remaining values do not shift.
        continue