Related
I have csv file that looks like this:
id,type,inc,a,b
1,2,63376.799999999996,0.3061,187
2,1,58087.700000000004,0.3361,178
3,1,52699.9,0.3482,181
4,2,72964.8,0.3186,177
5,4,111589.79999999999,0.268,154
6,4,107618.0,0.2583,150
7,2,87109.2,0.3233,193
8,2,84669.59999999999,0.308,179
9,2,77258.4,0.3247,173
I need to convert the values of fields [3], [4] and [5] (that is, 'inc', 'a' and 'b') to float and then append each row to households to build a list.
Finally it should print len(households)
I am trying this
import csv
import scipy.optimize as opt
def read_households(filename):
# Question snippet: tries to load the CSV rows into the list `households`.
households=[]
# NOTE(review): the path is hard-coded, so the `filename` argument is never used;
# the handle is also never closed in this snippet.
fh = open("households.csv", 'r')
reader = csv.DictReader(fh) #creates object 'reader' to read the file
for hh in reader:
# NOTE(review): a csv.DictReader is not subscriptable, so reader[3,4,5] raises
# TypeError; the values of the current row live in the dict `hh`, keyed by header name.
reader[3,4,5] = float(reader[3,4,5])
households.append(hh)
print("lines read:", len(households))
# NOTE(review): `household` is not defined in this snippet (the list is `households`).
return(household)
It does not give me anything.
Try this code to get the expected output from the CSV file.
import csv
import scipy.optimize as opt
def read_households(filename):
    """Read a households CSV file and return its rows with numeric fields parsed.

    Each data row becomes a dict keyed by the CSV header. The 'inc', 'a' and
    'b' values are converted to float; every other value is left as the
    string produced by csv.DictReader. The number of rows read is printed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order (empty if the file
        contains only a header).

    Raises:
        KeyError: If the header lacks an 'inc', 'a' or 'b' column.
        ValueError: If one of those values cannot be parsed as a float.
    """
    households = []
    # The context manager closes the file even when a row fails to parse;
    # newline='' is how the csv module expects its input files to be opened.
    with open(filename, 'r', newline='') as fh:
        for hh in csv.DictReader(fh):
            for field in ('inc', 'a', 'b'):
                hh[field] = float(hh[field])
            households.append(hh)
    print("lines read:", len(households))
    return households
# call function
read_households('path_of_csv_file')
Result :
lines read: 9
[{'id': '1', 'type': '2', 'inc': 63376.799999999996, 'a': 0.3061, 'b': 187.0},
{'id': '2', 'type': '1', 'inc': 58087.700000000004, 'a': 0.3361, 'b': 178.0},
{'id': '3', 'type': '1', 'inc': 52699.9, 'a': 0.3482, 'b': 181.0},
{'id': '4', 'type': '2', 'inc': 72964.8, 'a': 0.3186, 'b': 177.0},
{'id': '5', 'type': '4', 'inc': 111589.79999999999, 'a': 0.268, 'b': 154.0},
{'id': '6', 'type': '4', 'inc': 107618.0, 'a': 0.2583, 'b': 150.0},
{'id': '7', 'type': '2', 'inc': 87109.2, 'a': 0.3233, 'b': 193.0},
{'id': '8', 'type': '2', 'inc': 84669.59999999999, 'a': 0.308, 'b': 179.0},
{'id': '9', 'type': '2', 'inc': 77258.4, 'a': 0.3247, 'b': 173.0}]
If you can use pandas then this is a very easy way:
import pandas as pd
# Load the CSV, then force the three numeric columns to float dtype.
float_columns = ('inc', 'a', 'b')
df = pd.read_csv('file.csv')
df = df.astype({column: 'float' for column in float_columns})
print(df)
# The first element of the shape is the number of data rows that were read.
row_count = df.shape[0]
print(f"Lines read: {row_count}")
When run gives:
id type inc a b
0 1 2 63376.8 0.3061 187.0
1 2 1 58087.7 0.3361 178.0
2 3 1 52699.9 0.3482 181.0
...
8 9 2 77258.4 0.3247 173.0
Lines read: 9
If you need the output in the list of dictionaries output:
households = df.to_dict('records')
and we are done
I am web scraping the ICC top 10 teams, and I get the same class for both the points and the matches cells:
"td",class_='table-body__cell u-center-text'
how do i split this
# Download the rankings page; `url1` is expected to be defined earlier.
page = requests.get(url1)
page
# Parse the HTML and print it so the table markup can be inspected.
# (These were two statements fused onto one line, which is a SyntaxError.)
soup1 = BeautifulSoup(page.content, "html.parser")
print(soup1.prettify())
# Collect the text of every <td> that carries the class searched for below.
matches = []
for i in soup1.find_all("td", class_='rankings-block__banner-matches'):
    matches.append(i.text)
matches
Simple way: use pandas
You can use pandas to read the table into a dataframe and pick the values you want:
import pandas as pd
pd.read_html('https://www.icc-cricket.com/rankings/mens/team-rankings/odi/')[0]
Alternative with bs4
# The third and fourth <td> of each table row are collected as the
# matches and points values respectively.
match_cells = soup.select('table.table tr td:nth-of-type(3)')
point_cells = soup.select('table.table tr td:nth-of-type(4)')
matches = [cell.get_text() for cell in match_cells]
points = [cell.get_text() for cell in point_cells]
print(matches, points)
or
# Walk the table row by row, skipping the first <tr>, and read the third
# and fourth cell of each row.
matches, points = [], []
table_rows = soup.select('table.table tr')
for table_row in table_rows[1:]:
    matches.append(table_row.select_one('td:nth-of-type(3)').get_text())
    points.append(table_row.select_one('td:nth-of-type(4)').get_text())
print(matches, points)
A complete solution, just run this code and you will get a dictionary with all the data from the table organized nicely:
def _text_of(node, tag, css_class):
    """Return the stripped text of the first `tag` under `node` whose class is `css_class`."""
    return node.find(tag, {'class': css_class}).text.strip()


# Locate the rankings table inside the parsed page.
table = soup1.find('table', {'class': 'table'})

# Maps each country name to its position, matches, points and rating (as text).
rankings = {}

# The first row uses different markup than the others, so it is read on its own.
position = _text_of(table, 'td', 'rankings-block__banner--pos')
country_name = _text_of(table, 'span', 'u-hide-phablet')
matches = _text_of(table, 'td', 'rankings-block__banner--matches')
points = _text_of(table, 'td', 'rankings-block__banner--points')
rating = _text_of(table, 'td', 'rankings-block__banner--rating u-text-right')
rankings[country_name] = {
    'position': position,
    'matches': matches,
    'points': points,
    'rating': rating,
}

# Every remaining row shares one layout; matches and points carry the same
# class, so they are told apart by their order within the row.
for row in table.find_all('tr', {'class': 'table-body'}):
    position = _text_of(row, 'td', 'table-body__cell table-body__cell--position u-text-right')
    country_name = _text_of(row, 'span', 'u-hide-phablet')
    centered_cells = row.find_all('td', {'class': 'table-body__cell u-center-text'})
    matches = centered_cells[0].text.strip()
    points = centered_cells[1].text.strip()
    rating = _text_of(row, 'td', 'table-body__cell u-text-right rating')
    rankings[country_name] = {
        'position': position,
        'matches': matches,
        'points': points,
        'rating': rating,
    }
rankings
Which outputs:
{'New Zealand': {'position': '1',
'matches': '17',
'points': '2,054',
'rating': '121'},
'England': {'position': '2',
'matches': '32',
'points': '3,793',
'rating': '119'},
'Australia': {'position': '3',
'matches': '28',
'points': '3,244',
'rating': '116'},
'India': {'position': '4',
'matches': '32',
'points': '3,624',
'rating': '113'},
'South Africa': {'position': '5',
'matches': '25',
'points': '2,459',
'rating': '98'},
'Pakistan': {'position': '6',
'matches': '27',
'points': '2,524',
'rating': '93'},
'Bangladesh': {'position': '7',
'matches': '30',
'points': '2,740',
'rating': '91'},
'West Indies': {'position': '8',
'matches': '30',
'points': '2,523',
'rating': '84'},
'Sri Lanka': {'position': '9',
'matches': '32',
'points': '2,657',
'rating': '83'},
'Afghanistan': {'position': '10',
'matches': '17',
'points': '1,054',
'rating': '62'},
'Netherlands': {'position': '11',
'matches': '7',
'points': '336',
'rating': '48'},
'Ireland': {'position': '12',
'matches': '25',
'points': '1,145',
'rating': '46'},
'Oman': {'position': '13', 'matches': '11', 'points': '435', 'rating': '40'},
'Scotland': {'position': '14',
'matches': '8',
'points': '308',
'rating': '39'},
'Zimbabwe': {'position': '15',
'matches': '20',
'points': '764',
'rating': '38'},
'Nepal': {'position': '16', 'matches': '11', 'points': '330', 'rating': '30'},
'UAE': {'position': '17', 'matches': '9', 'points': '190', 'rating': '21'},
'United States': {'position': '18',
'matches': '14',
'points': '232',
'rating': '17'},
'Namibia': {'position': '19', 'matches': '6', 'points': '97', 'rating': '16'},
'Papua New Guinea': {'position': '20',
'matches': '10',
'points': '0',
'rating': '0'}}
In addition, you can also add it to a pandas dataframe for better analysis:
pd.DataFrame(rankings)
Which outputs:
New Zealand England Australia India South Africa Pakistan Bangladesh West Indies Sri Lanka Afghanistan Netherlands Ireland Oman Scotland Zimbabwe Nepal UAE United States Namibia Papua New Guinea
position 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
matches 17 32 28 32 25 27 30 30 32 17 7 25 11 8 20 11 9 14 6 10
points 2,054 3,793 3,244 3,624 2,459 2,524 2,740 2,523 2,657 1,054 336 1,145 435 308 764 330 190 232 97 0
rating 121 119 116 113 98 93 91 84 83 62 48 46 40 39 38 30 21 17 16 0
Here is the original dataframe: 2 rows with the columns ID and ColumnA. Each ColumnA value is a dictionary of details, and some rows may contain only a single detail.
ID ColumnA
1 {'1': {'Order': '0', 'Result': ''},
'2': {'Order': 'Yellow', 'Result': 'Red'},
'3': {'Order': 'Clear', 'Result': 'Tight'},
'4': {'Order': '1.000-1.030', 'Result': '1.015'}}
2 {'1': {'Order': '0', 'Result': '1.015'},
'4': {'Order': '1.000-1.030', 'Result': '2.4'},
'5': {'Order': '6.0-7.0', 'Result': ''},
'6': {'Order': 'Negative', 'Result': 'Negative'},
'7': {'Order': 'Negative', 'Result': 'Negative'},
'8': {'Order': 'Negative', 'Result': 'Positive'},
'9': {'Order': 'Negative', 'Result': ''}}
I want to extract from ColumnA to new dataframe
ID Column_ID Column_Order ColumnD_Result
1 1 0
1 2 Yellow Red
1 3 Clear Tight
1 4 1.000-1.030 1.015
2 1 0 1.015
2 4 1.000-1.030 2.4
2 5 6.0-7.0
2 6 Negative Negative
2 7 Negative Negative
2 8 Negative Positive
2 9 Negative
How to write the extraction of dictionary?
Extraction by looping on dictionary items:
import pandas as pd

# Sample input: one row per ID; ColumnA maps a detail ID to its
# {'Order': ..., 'Result': ...} dictionary.
data = [
    ['1', {'1': {'Order': '0', 'Result': ''},
           '2': {'Order': 'Yellow', 'Result': 'Red'},
           '3': {'Order': 'Clear', 'Result': 'Tight'},
           '4': {'Order': '1.000-1.030', 'Result': '1.015'}}],
    ['2', {'1': {'Order': '0', 'Result': '1.015'},
           '4': {'Order': '1.000-1.030', 'Result': '2.4'},
           '5': {'Order': '6.0-7.0', 'Result': ''},
           '6': {'Order': 'Negative', 'Result': 'Negative'},
           '7': {'Order': 'Negative', 'Result': 'Negative'},
           '8': {'Order': 'Negative', 'Result': 'Positive'},
           '9': {'Order': 'Negative', 'Result': ''}}],
]
df = pd.DataFrame(data, columns=['ID', 'ColumnA'])

# Gather every (ID, detail ID, Order, Result) row first and build the frame in
# one call. Enlarging a DataFrame one row at a time through .loc copies data on
# each insert, so the row-by-row approach becomes quadratic as the input grows.
rows = [
    (row_id, column_id, detail['Order'], detail['Result'])
    for row_id, details in zip(df['ID'], df['ColumnA'])
    for column_id, detail in details.items()
]
dfColumnA = pd.DataFrame(
    rows, columns=['ID', 'Column_ID', 'Column_Order', 'ColumnD_Result']
)
print(dfColumnA)
Output:
ID Column_ID Column_Order ColumnD_Result
0 1 1 0
1 1 2 Yellow Red
2 1 3 Clear Tight
3 1 4 1.000-1.030 1.015
4 2 1 0 1.015
5 2 4 1.000-1.030 2.4
6 2 5 6.0-7.0
7 2 6 Negative Negative
8 2 7 Negative Negative
9 2 8 Negative Positive
10 2 9 Negative
I have a list of dict like:
data = [
{'ID': '000681', 'type': 'B:G+', 'testA': '11'},
{'ID': '000682', 'type': 'B:G+', 'testA': '-'},
{'ID': '000683', 'type': 'B:G+', 'testA': '13'},
{'ID': '000684', 'type': 'B:G+', 'testA': '14'},
{'ID': '000681', 'type': 'B:G+', 'testB': '15'},
{'ID': '000682', 'type': 'B:G+', 'testB': '16'},
{'ID': '000683', 'type': 'B:G+', 'testB': '17'},
{'ID': '000684', 'type': 'B:G+', 'testB': '-'}
]
How to use Pandas to get data like:
data = [
{'ID': '000683', 'type': 'B:G+', 'testA': '13', 'testB': '17'},
{'ID': '000681', 'type': 'B:G+', 'testA': '11', 'testB': '15'},
{'ID': '000684', 'type': 'B:G+', 'testA': '14', 'testB': '-'},
{'ID': '000682', 'type': 'B:G+', 'testA': '-', 'testB': '16'}
]
Rows with the same ID and the same type should be merged into one record, sorted by the testA and testB values.
Sort order: records where both testA and testB have a value come first, with the larger value of testA+testB at the top.
First convert the columns to numeric, coercing non-numeric values to NaN, and then aggregate with sum:
df = pd.DataFrame(data)
c = ['testA', 'testB']
# Values that are not numbers (such as '-') become NaN.
df[c] = df[c].apply(lambda column: pd.to_numeric(column, errors='coerce'))
# One row per (ID, type); min_count=1 keeps NaN for a group with no numeric value.
totals = df.groupby(['ID', 'type'])[c].sum(min_count=1)
ordered = totals.sort_values(c)
# Put the '-' placeholder back for the missing values and restore ID/type as columns.
df1 = ordered.fillna('-').reset_index()
print(df1)
ID type testA testB
0 000681 B:G+ 11 15
1 000683 B:G+ 13 17
2 000684 B:G+ 14 -
3 000682 B:G+ - 16
If you want to sort by the sum of both columns, use Series.argsort:
df = pd.DataFrame(data)
c = ['testA', 'testB']
# Values that are not numbers (such as '-') become NaN.
df[c] = df[c].apply(lambda column: pd.to_numeric(column, errors='coerce'))
# One row per (ID, type); min_count=1 keeps NaN for a group with no numeric value.
df2 = df.groupby(['ID', 'type'])[c].sum(min_count=1)
# Negating before summing makes argsort return positions from the largest
# row total to the smallest.
negated_totals = (-df2).sum(axis=1)
row_order = negated_totals.argsort()
df2 = df2.iloc[row_order].fillna('-').reset_index()
print(df2)
ID type testA testB
0 000683 B:G+ 13 17
1 000681 B:G+ 11 15
2 000682 B:G+ - 16
3 000684 B:G+ 14 -
I am creating a function that grabs data from an ERP system to display to the end user.
I want to unpack an object of dictionaries and create a range of Pandas DataFrames with them.
For example, I have:
troRows
{0: [{'productID': 134336, 'price': '10.0000', 'amount': '1', 'cost': 0}],
1: [{'productID': 142141, 'price': '5.5000', 'amount': '4', 'cost': 0}],
2: [{'productID': 141764, 'price': '5.5000', 'amount': '1', 'cost': 0}],
3: [{'productID': 81661, 'price': '4.5000', 'amount': '1', 'cost': 0}],
4: [{'productID': 146761, 'price': '5.5000', 'amount': '1', 'cost': 0}],
5: [{'productID': 143585, 'price': '5.5900', 'amount': '9', 'cost': 0}],
6: [{'productID': 133018, 'price': '5.0000', 'amount': '1', 'cost': 0}],
7: [{'productID': 146250, 'price': '13.7500', 'amount': '5', 'cost': 0}],
8: [{'productID': 149986, 'price': '5.8900', 'amount': '2', 'cost': 0},
{'productID': 149790, 'price': '4.9900', 'amount': '2', 'cost': 0},
{'productID': 149972, 'price': '5.2900', 'amount': '2', 'cost': 0},
{'productID': 149248, 'price': '2.0000', 'amount': '2', 'cost': 0},
{'productID': 149984, 'price': '4.2000', 'amount': '2', 'cost': 0},
Each time the function will need to unpack x number of dictionaries which may have different number of rows into a range of DataFrames.
So for example, this range of Dictionaries would return
DF0, DF1, DF2, DF3, DF4, DF5, DF6, DF7, DF8.
I can unpack a single Dictionary with:
pd.DataFrame(troRows[8])
which returns
amount cost price productID
0 2 0 5.8900 149986
1 2 0 4.9900 149790
2 2 0 5.2900 149972
3 2 0 2.0000 149248
4 2 0 4.2000 149984
How can I structure my code so that it does this for all the dictionaries for me?
Solution for a dictionary of DataFrames - use a dictionary comprehension so that each DataFrame is stored under its original dictionary key:
dfs = {k: pd.DataFrame(v) for k, v in troRows.items()}
print (dfs)
{0: amount cost price productID
0 1 0 10.0000 134336, 1: amount cost price productID
0 4 0 5.5000 142141, 2: amount cost price productID
0 1 0 5.5000 141764, 3: amount cost price productID
0 1 0 4.5000 81661, 4: amount cost price productID
0 1 0 5.5000 146761, 5: amount cost price productID
0 9 0 5.5900 143585, 6: amount cost price productID
0 1 0 5.0000 133018, 7: amount cost price productID
0 5 0 13.7500 146250, 8: amount cost price productID
0 2 0 5.8900 149986
1 2 0 4.9900 149790
2 2 0 5.2900 149972
3 2 0 2.0000 149248
4 2 0 4.2000 149984}
print (dfs[8])
amount cost price productID
0 2 0 5.8900 149986
1 2 0 4.9900 149790
2 2 0 5.2900 149972
3 2 0 2.0000 149248
4 2 0 4.2000 149984
Solutions for one DataFrame:
Use list comprehension with flattening and pass it to DataFrame constructor:
# Each element is a list of row dicts; a list may hold one row or several.
grouped_records = [
    [{'productID': 134336, 'price': '10.0000', 'amount': '1', 'cost': 0}],
    [{'productID': 142141, 'price': '5.5000', 'amount': '4', 'cost': 0}],
    [{'productID': 141764, 'price': '5.5000', 'amount': '1', 'cost': 0}],
    [{'productID': 81661, 'price': '4.5000', 'amount': '1', 'cost': 0}],
    [{'productID': 146761, 'price': '5.5000', 'amount': '1', 'cost': 0}],
    [{'productID': 143585, 'price': '5.5900', 'amount': '9', 'cost': 0}],
    [{'productID': 133018, 'price': '5.0000', 'amount': '1', 'cost': 0}],
    [{'productID': 146250, 'price': '13.7500', 'amount': '5', 'cost': 0}],
    [
        {'productID': 149986, 'price': '5.8900', 'amount': '2', 'cost': 0},
        {'productID': 149790, 'price': '4.9900', 'amount': '2', 'cost': 0},
        {'productID': 149972, 'price': '5.2900', 'amount': '2', 'cost': 0},
        {'productID': 149248, 'price': '2.0000', 'amount': '2', 'cost': 0},
        {'productID': 149984, 'price': '4.2000', 'amount': '2', 'cost': 0},
    ],
]
troRows = pd.Series(grouped_records)

# Flatten the per-element lists into one list of records, then build a
# single DataFrame with one row per record.
flat_records = [record for group in troRows for record in group]
df = pd.DataFrame(flat_records)
Another way to flatten your data is to use chain.from_iterable:
from itertools import chain
df = pd.DataFrame(list(chain.from_iterable(troRows)))
print (df)
amount cost price productID
0 1 0 10.0000 134336
1 4 0 5.5000 142141
2 1 0 5.5000 141764
3 1 0 4.5000 81661
4 1 0 5.5000 146761
5 9 0 5.5900 143585
6 1 0 5.0000 133018
7 5 0 13.7500 146250
8 2 0 5.8900 149986
9 2 0 4.9900 149790
10 2 0 5.2900 149972
11 2 0 2.0000 149248
12 2 0 4.2000 149984