import urllib.request
import json
from collections import Counter
def count_coauthors(author_id):
coauthors_dict = {}
url_str = ('https://api.semanticscholar.org/graph/v1/author/47490276?fields=name,papers.authors')
respons = urllib.request.urlopen(url_str)
text = respons.read().decode()
for line in respons:
print(line.decode().rstip())
data = json.loads(text)
print(type(data))
print(list(data.keys()))
print(data["name"])
print(data["authorId"])
name = []
for lines in data["papers"]:
for authors in lines["authors"]:
name.append(authors.get("name"))
print(name)
count = dict()
names = name
for i in names:
if i not in count:
count[i] = 1
else:
count[i] += 1
print(count)
c = Counter(count)
top = c.most_common(10)
print(top)
return coauthors_dict
author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc.items(), key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
print(co_author)
This is how my code looks this far, there are no error. I need to get rid of the rest of the text when I run it, so it should look like this:
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
I have tried using rstrip and split on my 'c' variable but it doesn't work. Im only allowed importing what I already have imported and must use the link which is included.
Tips on simplifying or bettering the code is also appreciated!
("Extend the program below so that it prints the names of the top-10 coauthors together with the numbers of the coauthored publications")
From what I understand you are not quite sure where your successful output originates from. It is not the 5 lines at the end.
Your result is printed by the print(top) on line 39. This top variable is what you want to return from the function, as the coauthors_dict you are currently returning never actually gets any data written to it.
You will also have to slightly adjust your sorted(...) as you now have a list and not a dictionary, but you should then get the correct result.
If I understand correctly you are wanting this function to return a count of each distinct co-author (excluding the author), which it seems like you already have in your count variable, which you don't return. The variable you DO return is empty.
Instead consider:
import urllib.request
import json
from collections import Counter
def count_coauthors(author_id):
url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
response = urllib.request.urlopen(url_str)
text = response.read().decode()
data = json.loads(text)
names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
#The statement above can be written long-hand like:
#names=[]
#for l in data["papers"]:
# for a in l["authors"]:
# if a['authorId'] != author_id:
# names.append(a.get("name"))
return list(Counter(names).items())
author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc, key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
print(co_author)
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
You might also consider moving the top N logic into the function as an optional paramter:
import urllib.request
import json
from collections import Counter
def count_coauthors(author_id, top=0):
url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
response = urllib.request.urlopen(url_str)
text = response.read().decode()
data = json.loads(text)
names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
name_count = list(Counter(names).items())
top = top if top!=0 else len(name_count)
return sorted(name_count, key=lambda x: x[1], reverse=True)[:top]
author_id = '47490276'
for auth in count_coauthors(author_id, top=10):
print(auth)
Im having some performance issues with the code below, mostly because of the apply function that im using on a huge dataframe. I want to update the semi_dict dictionary with some other data that im calculating with the some functions. Is it any way to improve this?
def my_function_1(semi_dict, row):
#do some calculation/other stuff based on the row data and append it to the dictionary
random_dict = dict(data=some_data, more_data=more_data)
semi_dict["data"].append(random_dict)
def my_function_2(semi_dict, row):
#do some calculation/other stuff based on the row data and append it to the dictionary
random_dict = dict(data=some_data, more_data=more_data)
semi_dict["data2"].append(random_dict)
dictionary_list = []
for v in values:
df_1_rows = df_1_rows[(df_1_rows.values == v)]
df_2_rows = df_2_rows[(df_2_rows.values == v)]
semi_dict = dict(value=v, data=[], data2=[])
function = partial(my_function_1, semi_dict)
function_2 = partial(my_function_2, semi_dict)
df_1_rows.apply(lambda row : function(row), axis=1)
df_2_rows.apply(lambda row : function_2(row), axis=1)
dictionary_list.append(semi_dict)
This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random
len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")
df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])
def make_dict(df):
# some calculations on the df
return {
'data': df.head(1).values.tolist(),
}
def make_dict_2(df):
# some calculations on the df
return {
'data_2': df.head(1).values.tolist(),
}
def merge(a, b, path=None):
"merges b into a, taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries "
if path is None: path = []
for key in b:
if key in a:
if isinstance(a[key], dict) and isinstance(b[key], dict):
merge(a[key], b[key], path + [str(key)])
elif a[key] == b[key]:
pass # same leaf value
else:
raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
else:
a[key] = b[key]
return a
dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()
result = merge(dict1, dict2)
result
I have a list of lists containing company objects:
companies_list = [companies1, companies2]
I have the following function:
def get_fund_amount_by_year(companies_list):
companies_length = len(companies_list)
for idx, companies in enumerate(companies_list):
companies1 = companies.values_list('id', flat=True)
funding_rounds = FundingRound.objects.filter(company_id__in=companies1).order_by('announced_on')
amount_per_year_list = []
for fr in funding_rounds:
fr_year = fr.announced_on.year
fr_amount = fr.raised_amount_usd
if not any(d['year'] == fr_year for d in amount_per_year_list):
year_amount = {}
year_amount['year'] = fr_year
for companies_idx in range(companies_length):
year_amount['amount'+str(companies_idx)] = 0
if companies_idx == idx:
year_amount['amount'+str(companies_idx)] = fr_amount
amount_per_year_list.append(year_amount)
else:
for year_amount in amount_per_year_list:
if year_amount['year'] == fr_year:
year_amount['amount'+str(idx)] += fr_amount
return amount_per_year_list
The problem is the resulting list of dictionaries has only one amount attribute updated.
As you can see "amount0" contains all "0" amounts:
[{'amount1': 12100000L, 'amount0': 0, 'year': 1999}, {'amount1':
8900000L, 'amount0': 0, 'year': 2000}]
What am I doing wrong?
I put list of dictionaries being built in the loop and so when it iterated it overwrote the last input. I changed it to look like:
def get_fund_amount_by_year(companies_list):
companies_length = len(companies_list)
**amount_per_year_list = []**
for idx, companies in enumerate(companies_list):
companies1 = companies.values_list('id', flat=True)
funding_rounds = FundingRound.objects.filter(company_id__in=companies1).order_by('announced_on')
I'm not entirely sure why im getting a dictionary key error. I'm trying to create a multi level dict with = sign and getting a key error on metrics, but not on the first two.
doc['timestamp']
and
doc['instance_id']
both work fine, but when it gets to metrics it gives me a metrics key error. I'm not entirely sure why.
doc = {}
doc['timestamp'] = datetime.now()
#doc['instance_id'] = get_cloud_app_name()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics']['cpu_usage']['user_cpu'] = cpu_dict_returned['user_cpu']
doc['metrics']["cpu_usage"]['system_cpu'] = cpu_dict_returned['system_cpu']
doc['metrics']["cpu_usage"]['idle_cpu'] = cpu_dict_returned['idle_cpu']
doc['metrics']["cpu_usage"]['cpu_count'] = cpu_dict_returned['cpu_count']
You must create the sub-dictionnaries before using them:
doc = {}
doc['timestamp'] = datetime.now()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics'] = {}
doc['metrics']['cpu_usage'] = {}
doc['metrics']['cpu_usage']['user_cpu'] = cpu_dict_returned['user_cpu']
doc['metrics']["cpu_usage"]['system_cpu'] = cpu_dict_returned['system_cpu']
doc['metrics']["cpu_usage"]['idle_cpu'] = cpu_dict_returned['idle_cpu']
doc['metrics']["cpu_usage"]['cpu_count'] = cpu_dict_returned['cpu_count']
You can do this more succinctly using a dictionary comprehension:
doc = {}
doc['timestamp'] = datetime.now()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics'] = {
'cpu_usage':
{k: cpu_dict_returned.get(k)
for k in ['user_cpu', 'system_cpu', 'idle_cpu', 'cpu_count']}
}
Note that the sub dictionary cpu_usage is first created, and then the nested dictionary is inserted.
I have a list of data points that contains a measurement every 5 minutes for 24 hours. I need to create a new list with the average of that measurement for each hour in the list. What's the best way to accomplish that?
Date Amount
2015-03-14T00:00:00.000-04:00 12545.869
2015-03-14T00:05:00.000-04:00 12467.326
2015-03-14T00:10:00.000-04:00 12416.948
2015-03-14T00:15:00.000-04:00 12315.698
2015-03-14T00:20:00.000-04:00 12276.38
2015-03-14T00:25:00.000-04:00 12498.696
2015-03-14T00:30:00.000-04:00 12426.145
2015-03-14T00:35:00.000-04:00 12368.659
2015-03-14T00:40:00.000-04:00 12322.785
2015-03-14T00:45:00.000-04:00 12292.719
2015-03-14T00:50:00.000-04:00 12257.965
2015-03-14T00:55:00.000-04:00 12221.375
2015-03-14T01:00:00.000-04:00 12393.725
2015-03-14T01:05:00.000-04:00 12366.674
2015-03-14T01:10:00.000-04:00 12378.578
2015-03-14T01:15:00.000-04:00 12340.754
2015-03-14T01:20:00.000-04:00 12288.511
2015-03-14T01:25:00.000-04:00 12266.136
2015-03-14T01:30:00.000-04:00 12236.639
2015-03-14T01:35:00.000-04:00 12181.668
2015-03-14T01:40:00.000-04:00 12171.992
2015-03-14T01:45:00.000-04:00 12164.298
2015-03-14T01:50:00.000-04:00 12137.282
2015-03-14T01:55:00.000-04:00 12116.486
2015-03-14T02:00:02.000-04:00 12090.439
2015-03-14T02:05:00.000-04:00 12085.924
2015-03-14T02:10:00.000-04:00 12034.78
2015-03-14T02:15:00.000-04:00 12037.367
2015-03-14T02:20:00.000-04:00 12006.649
2015-03-14T02:25:00.000-04:00 11985.588
2015-03-14T02:30:00.000-04:00 11999.41
2015-03-14T02:35:00.000-04:00 11943.121
2015-03-14T02:40:00.000-04:00 11934.346
2015-03-14T02:45:00.000-04:00 11928.568
2015-03-14T02:50:00.000-04:00 11918.63
2015-03-14T02:55:00.000-04:00 11885.698
2015-03-14T03:00:00.000-04:00 11863.065
2015-03-14T03:05:00.000-04:00 11883.256
2015-03-14T03:10:00.000-04:00 11870.095
2015-03-14T03:15:00.000-04:00 11849.104
2015-03-14T03:20:00.000-04:00 11849.18
2015-03-14T03:25:00.000-04:00 11834.229
2015-03-14T03:30:00.000-04:00 11826.603
2015-03-14T03:35:00.000-04:00 11823.516
2015-03-14T03:40:00.000-04:00 11849.386
2015-03-14T03:45:00.000-04:00 11832.385
2015-03-14T03:50:00.000-04:00 11847.059
2015-03-14T03:55:00.000-04:00 11831.807
2015-03-14T04:00:00.000-04:00 11844.027
2015-03-14T04:05:00.000-04:00 11873.114
2015-03-14T04:10:00.000-04:00 11904.105
2015-03-14T04:15:00.000-04:00 11879.018
2015-03-14T04:20:00.000-04:00 11899.658
2015-03-14T04:25:00.000-04:00 11887.808
2015-03-14T04:30:00.000-04:00 11879.875
2015-03-14T04:35:00.000-04:00 11924.149
2015-03-14T04:40:00.000-04:00 11929.499
2015-03-14T04:45:00.000-04:00 11932.086
2015-03-14T04:50:00.000-04:00 11989.847
2015-03-14T04:55:00.000-04:00 12000.971
This is a beautiful use of itertools.groupby because you can actually take advantage of the generators it returns instead of instantly making them lists or something:
import itertools, pprint
d = {}
for (key,gen) in itertools.groupby(lst, key=lambda l: int(l[0][11:13])):
d[key] = sum(v for (d,v) in gen)
pprint.pprint(d)
And for average instead of sum:
import itertools, pprint
def avg(gf):
_sum = 0
for (i,e) in enumerate(gf): _sum += e
return float(_sum) / (i+1)
d = {}
for (key,gen) in itertools.groupby(lst, key=lambda l: int(l[0][11:13])):
#d[key] = sum(v for (d,v) in gen)
d[key] = avg(v for (d,v) in gen)
pprint.pprint(d)
Output:
{0: 148410.565,
1: 147042.743,
2: 143850.52000000002,
3: 142159.685,
4: 142944.15699999998}
Where the key of the dictionary ([0,1,2,3,4]) corresponds to the hour of the timestamp.
Input:
lst = [
['2015-03-14T00:00:00.000-04:00', 12545.869 ],
['2015-03-14T00:05:00.000-04:00', 12467.326],
['2015-03-14T00:10:00.000-04:00', 12416.948],
['2015-03-14T00:15:00.000-04:00', 12315.698],
['2015-03-14T00:20:00.000-04:00', 12276.38],
['2015-03-14T00:25:00.000-04:00', 12498.696],
['2015-03-14T00:30:00.000-04:00', 12426.145],
['2015-03-14T00:35:00.000-04:00', 12368.659],
['2015-03-14T00:40:00.000-04:00', 12322.785],
['2015-03-14T00:45:00.000-04:00', 12292.719],
['2015-03-14T00:50:00.000-04:00', 12257.965],
['2015-03-14T00:55:00.000-04:00', 12221.375],
['2015-03-14T01:00:00.000-04:00', 12393.725],
['2015-03-14T01:05:00.000-04:00', 12366.674],
['2015-03-14T01:10:00.000-04:00', 12378.578],
['2015-03-14T01:15:00.000-04:00', 12340.754],
['2015-03-14T01:20:00.000-04:00', 12288.511],
['2015-03-14T01:25:00.000-04:00', 12266.136],
['2015-03-14T01:30:00.000-04:00', 12236.639],
['2015-03-14T01:35:00.000-04:00', 12181.668],
['2015-03-14T01:40:00.000-04:00', 12171.992],
['2015-03-14T01:45:00.000-04:00', 12164.298],
['2015-03-14T01:50:00.000-04:00', 12137.282],
['2015-03-14T01:55:00.000-04:00', 12116.486],
['2015-03-14T02:00:02.000-04:00', 12090.439],
['2015-03-14T02:05:00.000-04:00', 12085.924],
['2015-03-14T02:10:00.000-04:00', 12034.78],
['2015-03-14T02:15:00.000-04:00', 12037.367],
['2015-03-14T02:20:00.000-04:00', 12006.649],
['2015-03-14T02:25:00.000-04:00', 11985.588],
['2015-03-14T02:30:00.000-04:00', 11999.41],
['2015-03-14T02:35:00.000-04:00', 11943.121],
['2015-03-14T02:40:00.000-04:00', 11934.346],
['2015-03-14T02:45:00.000-04:00', 11928.568],
['2015-03-14T02:50:00.000-04:00', 11918.63],
['2015-03-14T02:55:00.000-04:00', 11885.698],
['2015-03-14T03:00:00.000-04:00', 11863.065],
['2015-03-14T03:05:00.000-04:00', 11883.256],
['2015-03-14T03:10:00.000-04:00', 11870.095],
['2015-03-14T03:15:00.000-04:00', 11849.104],
['2015-03-14T03:20:00.000-04:00', 11849.18],
['2015-03-14T03:25:00.000-04:00', 11834.229],
['2015-03-14T03:30:00.000-04:00', 11826.603],
['2015-03-14T03:35:00.000-04:00', 11823.516],
['2015-03-14T03:40:00.000-04:00', 11849.386],
['2015-03-14T03:45:00.000-04:00', 11832.385],
['2015-03-14T03:50:00.000-04:00', 11847.059],
['2015-03-14T03:55:00.000-04:00', 11831.807],
['2015-03-14T04:00:00.000-04:00', 11844.027],
['2015-03-14T04:05:00.000-04:00', 11873.114],
['2015-03-14T04:10:00.000-04:00', 11904.105],
['2015-03-14T04:15:00.000-04:00', 11879.018],
['2015-03-14T04:20:00.000-04:00', 11899.658],
['2015-03-14T04:25:00.000-04:00', 11887.808],
['2015-03-14T04:30:00.000-04:00', 11879.875],
['2015-03-14T04:35:00.000-04:00', 11924.149],
['2015-03-14T04:40:00.000-04:00', 11929.499],
['2015-03-14T04:45:00.000-04:00', 11932.086],
['2015-03-14T04:50:00.000-04:00', 11989.847],
['2015-03-14T04:55:00.000-04:00', 12000.971],
]
Edit: per discussion in comments, what about:
import itertools, pprint
def avg(gf):
_sum = 0
for (i,e) in enumerate(gf): _sum += e
return float(_sum) / (i+1)
d = {}
for (key,gen) in itertools.groupby(lst, key=lambda l: int(l[0][11:13])):
vals = list(gen) # Unpack generator
key = vals[0][0][:13]
d[key] = avg(v for (d,v) in vals)
pprint.pprint(d)
You can do this pretty easily using a variety of tools, but I'll use a simple loop for simplicity sake:
>>> with open("listfile.txt", "r") as e:
>>> list_ = e.read().splitlines()
>>> list_ = list_[1:] # Grab all but the first line
>>>
>>> dateValue = dict()
>>> for row in list_:
>>> date, value - row.split()
>>> if ":00:" in date:
>>> # Start new value
>>> amount = int(value)
>>>
>>> elif ":55:" in date:
>>> # End new value
>>> date = date.split(':') # Grab only date and hour info
>>> dateValue[date] = amount / 12. # Returns a float, remove the period to return an integer
>>> del amount # Just in case the data isn't uniform, so it raises an error
>>>
>>> else:
>>> date += int(value)
If you want to export it to lists, just do:
>>> listDate = list()
>>> listAmount = list()
>>> for k in sorted(dateValue.keys() ):
>>> v = dateValue.get(k)
>>>
>>> listDate.append(k)
>>> listAmount.append(v)
quick and dirty way
reads= [
'2015-03-14T00:00:00.000-04:00 12545.869',
'2015-03-14T00:05:00.000-04:00 12467.326',
'2015-03-14T00:10:00.000-04:00 12416.948',
'2015-03-14T00:15:00.000-04:00 12315.698',
'2015-03-14T00:20:00.000-04:00 12276.38',
'2015-03-14T00:25:00.000-04:00 12498.696',
'2015-03-14T00:30:00.000-04:00 12426.145',
'2015-03-14T00:35:00.000-04:00 12368.659',
'2015-03-14T00:40:00.000-04:00 12322.785',
'2015-03-14T00:45:00.000-04:00 12292.719',
'2015-03-14T00:50:00.000-04:00 12257.965',
'2015-03-14T00:55:00.000-04:00 12221.375',
'2015-03-14T01:00:00.000-04:00 12393.725',
'2015-03-14T01:05:00.000-04:00 12366.674',
'2015-03-14T01:10:00.000-04:00 12378.578',
'2015-03-14T01:15:00.000-04:00 12340.754',
'2015-03-14T01:20:00.000-04:00 12288.511',
'2015-03-14T01:25:00.000-04:00 12266.136',
'2015-03-14T01:30:00.000-04:00 12236.639',
'2015-03-14T01:35:00.000-04:00 12181.668',
'2015-03-14T01:40:00.000-04:00 12171.992',
'2015-03-14T01:45:00.000-04:00 12164.298',
'2015-03-14T01:50:00.000-04:00 12137.282',
'2015-03-14T01:55:00.000-04:00 12116.486'
]
sums = {}
for read in reads:
hour = read.split(':')[0]
value = float(read.split().pop())
if hour in sums:
sums[hour] += value
else:
sums[hour] = value
avg = {}
for s in sums:
avg[s] = sums[s]/12
print avg