Convert the results of a dictionary to a dataframe - Python

From these commands:
from stackapi import StackAPI
lst = ['11786778','12370060']
df = pd.DataFrame(lst)
SITE = StackAPI('stackoverflow', key="xxxx")
results = []
for i in range(1, len(df)):
    SITE.max_pages = 10000000
    SITE.page_size = 100
    post = SITE.fetch('/users/{ids}/reputation-history', ids=lst[i])
    results.append(post)
The results variable holds the results in JSON format.
How can I convert the results variable to a dataframe with these five columns:
reputation_history_type, reputation_change, post_id, creation_date, user_id

Here, try this:
from stackapi import StackAPI
import pandas as pd

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow')
results = []
SITE.max_pages = 10000000
SITE.page_size = 100
for i in lst:
    post = SITE.fetch('/users/{ids}/reputation-history', ids=[i]).get('items')
    results.extend([list(j.values()) for j in post])
df = pd.DataFrame(results, columns=['reputation_history_type', 'reputation_change', 'post_id', 'creation_date', 'user_id'])
Output:
print(df.head()) gives:
  reputation_history_type  reputation_change   post_id  creation_date     user_id
0     asker_accepts_answer                  2  59126012     1575207944  11786778.0
1         post_undownvoted                  2  59118819     1575139301  11786778.0
2             post_upvoted                 10  59118819     1575139301  11786778.0
3           post_downvoted                 -2  59118819     1575139299  11786778.0
4             post_upvoted                 10  59110166     1575094452  11786778.0
print(df.tail()) gives:
    reputation_history_type  reputation_change   post_id  creation_date     user_id
170             post_upvoted                 10  58906292     1574036540  12370060.0
171          answer_accepted                 15  58896536     1573990105  12370060.0
172             post_upvoted                 10  58896044     1573972834  12370060.0
173           post_downvoted                  0  58896299     1573948372  12370060.0
174           post_downvoted                  0  58896158     1573947435  12370060.0
NOTE:
You can create the dataframe directly from the result, which will be a list of lists.
You don't need to set SITE.max_pages and SITE.page_size on every pass through lst.

from stackapi import StackAPI
import pandas as pd

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow', key="xxxx")
SITE.max_pages = 10000000
SITE.page_size = 100
results = []
for user_id in lst:
    post = SITE.fetch('/users/{ids}/reputation-history', ids=user_id)
    results.append(post)

# flatten the per-user responses into one list of records
data = []
for item in results:
    data.extend(item['items'])
df = pd.DataFrame(data, columns=['reputation_history_type', 'reputation_change', 'post_id', 'creation_date', 'user_id'])
print(df)

Flying a bit blind here since I maxed out my Stack Overflow API limit, but this should work:
from stackapi import StackAPI
from pandas.io.json import json_normalize
import pandas as pd

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow', key="xxx")
results = []
for ids in lst:
    SITE.max_pages = 10000000
    SITE.page_size = 100
    post = SITE.fetch('/users/{ids}/reputation-history', ids=ids)
    results.append(json_normalize(post, 'items'))
df = pd.concat(results, ignore_index=True)
json_normalize converts the JSON response into a dataframe;
pd.concat then concatenates the per-user dataframes into a single frame.
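For reference, here is a minimal sketch of what that normalization step does, using a made-up single-item payload shaped like the API response (the keys are the five columns the question asks for):

from pandas.io.json import json_normalize

payload = {
    "items": [
        {"reputation_history_type": "post_upvoted", "reputation_change": 10,
         "post_id": 59118819, "creation_date": 1575139301, "user_id": 11786778},
    ]
}
frame = json_normalize(payload, 'items')  # one dataframe row per element of payload['items']
print(frame.columns.tolist())
# ['reputation_history_type', 'reputation_change', 'post_id', 'creation_date', 'user_id']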

Related

Cannot convert a list of lists using pandas

This is my data from the API; from the looks of it, it is a list of lists.
with ApiClient(configuration) as api_client:
    api_instance = MetricsApi(api_client)
    response = api_instance.query_metrics(
        _from=int(yesterday_start_dt.timestamp()),
        to=int(yesterday_end_dt.timestamp()),
        query="default_zero(sum:trace.servlet.request.hits{env:prd-main,service:api}.as_rate())",
    )
result = response['series'][0]['pointlist']
print(result)
[[1648339200000.0, 1105.8433333333332], [1648339500000.0, 1093.3266666666666], [1648339800000.0, 1076.92], [1648340100000.0, 1059.5133333333333], [1648340400000.0, 1053.8966666666668], [1648340700000.0, 1041.2166666666667], [1648341000000.0, 1055.0533333333333], [1648341300000.0, 1037.8933333333334], [1648341600000.0, 1015.4], [1648341900000.0, 1003.3233333333334], [1648342200000.0, 1017.02], [1648342500000.0, 1017.7766666666666], [1648342800000.0, 1011.0333333333333], [1648343100000.0, 993.9366666666666], [1648343400000.0, 973.9733333333334], [1648343700000.0, 967.8433333333334], [1648344000000.0, 933.2166666666667], [1648344300000.0, 945.0833333333334], [1648344600000.0, 905.2166666666667], [1648344900000.0, 923.9966666666667], [1648345200000.0, 925.4633333333334], [1648345500000.0, 915.5533333333333], [1648345800000.0, 918.8966666666666], [1648346100000.0, 883.6], [1648346400000.0, 908.9166666666666], [1648346700000.0, 856.7333333333333], [1648347000000.0, 873.01], [1648347300000.0, 833.99], [1648347600000.0, 846.5466666666666], [1648347900000.0, 820.7833333333333], [1648348200000.0, 821.4633333333334], [1648348500000.0, 812.8633333333333], [1648348800000.0, 817.78], [1648349100000.0, 821.91], [1648349400000.0, 791.17], [1648349700000.0, 780.3066666666666], [1648350000000.0, 803.4633333333334], [1648350300000.0, 781.9033333333333], [1648350600000.0, 759.4933333333333], [1648350900000.0, 746.11], [1648351200000.0, 731.3133333333334], [1648351500000.0, 724.0533333333333], [1648351800000.0, 710.56], [1648352100000.0, 722.87], [1648352400000.0, 677.5266666666666], [1648352700000.0, 681.7833333333333], [1648353000000.0, 679.9233333333333], [1648353300000.0, 650.6466666666666], [1648353600000.0, 663.78], [1648353900000.0, 650.8133333333334], [1648354200000.0, 645.9133333333333], [1648354500000.0, 642.4566666666667], [1648354800000.0, 627.93], [1648355100000.0, 616.65], [1648355400000.0, 609.94], [1648355700000.0, 602.0733333333334], [1648356000000.0, 581.6133333333333], [1648356300000.0, 592.48], [1648356600000.0, 593.4], [1648356900000.0, 582.2633333333333], [1648357200000.0, 598.3766666666667], [1648357500000.0, 589.99], [1648357800000.0, 577.7433333333333], [1648358100000.0, 570.1733333333333], [1648358400000.0, 592.58], [1648358700000.0, 578.2533333333333], [1648359000000.0, 586.8833333333333], [1648359300000.0, 590.4033333333333], [1648359600000.0, 601.49], [1648359900000.0, 594.8], [1648360200000.0, 609.01], [1648360500000.0, 620.08], [1648360800000.0, 642.6466666666666], [1648361100000.0, 635.93], [1648361400000.0, 638.42], [1648361700000.0, 645.2], [1648362000000.0, 650.42], [1648362300000.0, 667.88], [1648362600000.0, 689.3666666666667], [1648362900000.0, 694.4433333333334], [1648363200000.0, 690.3933333333333], [1648363500000.0, 710.55], [1648363800000.0, 706.3], [1648364100000.0, 729.5], [1648364400000.0, 771.36], [1648364700000.0, 754.03], [1648365000000.0, 771.4866666666667], [1648365300000.0, 767.52], [1648365600000.0, 779.4133333333333], [1648365900000.0, 800.4266666666666], [1648366200000.0, 788.41], [1648366500000.0, 806.8666666666667], [1648366800000.0, 805.7466666666667], [1648367100000.0, 815.2433333333333], [1648367400000.0, 828.0833333333334], [1648367700000.0, 817.1966666666667], [1648368000000.0, 879.4733333333334], [1648368300000.0, 840.7933333333333], [1648368600000.0, 846.4266666666666], [1648368900000.0, 848.1266666666667], [1648369200000.0, 836.9066666666666], [1648369500000.0, 845.4966666666667], [1648369800000.0, 863.5033333333333], [1648370100000.0, 
867.1866666666666], [1648370400000.0, 866.74], [1648370700000.0, 863.8066666666666], [1648371000000.0, 882.38], [1648371300000.0, 876.0233333333333], [1648371600000.0, 905.3366666666667], [1648371900000.0, 879.8066666666666], [1648372200000.0, 878.37], [1648372500000.0, 876.9333333333333], [1648372800000.0, 868.1533333333333], [1648373100000.0, 882.12], [1648373400000.0, 896.9233333333333], [1648373700000.0, 872.84], [1648374000000.0, 880.71], [1648374300000.0, 894.8066666666666], [1648374600000.0, 873.7266666666667], [1648374900000.0, 891.0033333333333], [1648375200000.0, 927.2433333333333], [1648375500000.0, 905.52], [1648375800000.0, 895.0233333333333], [1648376100000.0, 895.86], [1648376400000.0, 899.3133333333334], [1648376700000.0, 920.22], [1648377000000.0, 937.68], [1648377300000.0, 916.46], [1648377600000.0, 926.6833333333333], [1648377900000.0, 936.4366666666666], [1648378200000.0, 947.6133333333333], [1648378500000.0, 957.7133333333334], [1648378800000.0, 989.1133333333333], [1648379100000.0, 959.0766666666667], [1648379400000.0, 963.5133333333333], [1648379700000.0, 978.3466666666667], [1648380000000.0, 1017.78], [1648380300000.0, 989.7566666666667], [1648380600000.0, 1023.4633333333334], [1648380900000.0, 1033.7166666666667], [1648381200000.0, 1025.1933333333334], [1648381500000.0, 1045.8633333333332], [1648381800000.0, 1063.6133333333332], [1648382100000.0, 1078.45], [1648382400000.0, 1116.3866666666668], [1648382700000.0, 1098.9766666666667], [1648383000000.0, 1101.29], [1648383300000.0, 1127.6], [1648383600000.0, 1102.5233333333333], [1648383900000.0, 1140.84], [1648384200000.0, 1169.23], [1648384500000.0, 1158.6], [1648384800000.0, 1180.01], [1648385100000.0, 1190.43], [1648385400000.0, 1207.3733333333332], [1648385700000.0, 1212.7666666666667], [1648386000000.0, 1244.17], [1648386300000.0, 1245.3166666666666], [1648386600000.0, 1240.69], [1648386900000.0, 1270.33], [1648387200000.0, 1277.8033333333333], [1648387500000.0, 1270.5966666666666], [1648387800000.0, 1304.4266666666667], [1648388100000.0, 1295.6933333333334], [1648388400000.0, 1322.3066666666666], [1648388700000.0, 1351.41], [1648389000000.0, 1339.9566666666667], [1648389300000.0, 1353.2966666666666], [1648389600000.0, 1398.45], [1648389900000.0, 1378.21], [1648390200000.0, 1361.0933333333332], [1648390500000.0, 1404.0833333333333], [1648390800000.0, 1394.6466666666668], [1648391100000.0, 1391.1366666666668], [1648391400000.0, 1450.0], [1648391700000.0, 1438.97], [1648392000000.0, 1411.83], [1648392300000.0, 1432.8233333333333], [1648392600000.0, 1473.3966666666668], [1648392900000.0, 1491.0166666666667], [1648393200000.0, 1509.8766666666668], [1648393500000.0, 1488.6566666666668], [1648393800000.0, 1488.4933333333333], [1648394100000.0, 1511.4466666666667], [1648394400000.0, 1508.3566666666666], [1648394700000.0, 1507.8966666666668], [1648395000000.0, 1515.8633333333332], [1648395300000.0, 1517.3], [1648395600000.0, 1528.81], [1648395900000.0, 1546.1266666666668], [1648396200000.0, 1554.57], [1648396500000.0, 1584.0333333333333], [1648396800000.0, 1584.45], [1648397100000.0, 1590.4633333333334], [1648397400000.0, 1580.0066666666667], [1648397700000.0, 1596.3833333333334], [1648398000000.0, 1571.96], [1648398300000.0, 1583.8233333333333], [1648398600000.0, 1618.7033333333334], [1648398900000.0, 1588.12], [1648399200000.0, 1599.56], [1648399500000.0, 1604.1833333333334], [1648399800000.0, 1621.5666666666666], [1648400100000.0, 1598.98], [1648400400000.0, 1627.02], [1648400700000.0, 1612.7833333333333], 
[1648401000000.0, 1612.2433333333333], [1648401300000.0, 1572.89], [1648401600000.0, 1601.8933333333334], [1648401900000.0, 1612.5366666666666], [1648402200000.0, 1608.7266666666667], [1648402500000.0, 1594.4366666666667], [1648402800000.0, 1614.3366666666666], [1648403100000.0, 1649.0733333333333], [1648403400000.0, 1627.12], [1648403700000.0, 1644.9633333333334], [1648404000000.0, 1653.9033333333334], [1648404300000.0, 1636.6966666666667], [1648404600000.0, 1639.5733333333333], [1648404900000.0, 1627.3866666666668], [1648405200000.0, 1626.3733333333332], [1648405500000.0, 1616.7966666666666], [1648405800000.0, 1667.2933333333333], [1648406100000.0, 1637.0733333333333], [1648406400000.0, 1654.6366666666668], [1648406700000.0, 1673.9566666666667], [1648407000000.0, 1658.4466666666667], [1648407300000.0, 1650.6766666666667], [1648407600000.0, 1662.1933333333334], [1648407900000.0, 1686.9733333333334], [1648408200000.0, 1623.0433333333333], [1648408500000.0, 1630.2866666666666], [1648408800000.0, 1599.0466666666666], [1648409100000.0, 1624.8033333333333], [1648409400000.0, 1606.0333333333333], [1648409700000.0, 1594.15], [1648410000000.0, 1557.1333333333334], [1648410300000.0, 1630.6133333333332], [1648410600000.0, 1591.93], [1648410900000.0, 1579.5733333333333], [1648411200000.0, 1585.1466666666668], [1648411500000.0, 1565.6166666666666], [1648411800000.0, 1566.3366666666666], [1648412100000.0, 1544.1866666666667], [1648412400000.0, 1511.8166666666666], [1648412700000.0, 1525.2333333333333], [1648413000000.0, 1505.57], [1648413300000.0, 1462.9033333333334], [1648413600000.0, 1478.0733333333333], [1648413900000.0, 1460.76], [1648414200000.0, 1504.59], [1648414500000.0, 1460.3366666666666], [1648414800000.0, 1445.9366666666667], [1648415100000.0, 1410.0033333333333], [1648415400000.0, 1412.8466666666666], [1648415700000.0, 1364.8933333333334], [1648416000000.0, 1348.4], [1648416300000.0, 1338.3333333333333], [1648416600000.0, 1326.8633333333332], [1648416900000.0, 1276.24], [1648417200000.0, 1310.0333333333333], [1648417500000.0, 1285.63], [1648417800000.0, 1244.14], [1648418100000.0, 1258.38], [1648418400000.0, 1218.37], [1648418700000.0, 1182.0266666666666], [1648419000000.0, 1196.8133333333333], [1648419300000.0, 1144.54], [1648419600000.0, 1165.62], [1648419900000.0, 1122.0166666666667], [1648420200000.0, 1112.6766666666667], [1648420500000.0, 1102.6], [1648420800000.0, 1095.6966666666667], [1648421100000.0, 1056.63], [1648421400000.0, 1074.5066666666667], [1648421700000.0, 1047.5933333333332], [1648422000000.0, 1057.2633333333333], [1648422300000.0, 1043.99], [1648422600000.0, 1003.4033333333333], [1648422900000.0, 1022.2633333333333], [1648423200000.0, 1016.59], [1648423500000.0, 997.4466666666667], [1648423800000.0, 988.7666666666667], [1648424100000.0, 966.1666666666666], [1648424400000.0, 991.21], [1648424700000.0, 977.6633333333333], [1648425000000.0, 959.64], [1648425300000.0, 961.6989966555184]]
But when I try to convert it with pandas, it converts, but the result is not what I expected; I expected the dataframe to have two columns.
with ApiClient(configuration) as api_client:
    api_instance = MetricsApi(api_client)
    response = api_instance.query_metrics(
        _from=int(yesterday_start_dt.timestamp()),
        to=int(yesterday_end_dt.timestamp()),
        query="default_zero(sum:trace.servlet.request.hits{env:prd-main,service:api}.as_rate())",
    )
result = response['series'][0]['pointlist']
df = pd.DataFrame(result)
print(df)
0
0 [1648339200000.0, 1105.8433333333332]
1 [1648339500000.0, 1093.3266666666666]
2 [1648339800000.0, 1076.92]
3 [1648340100000.0, 1059.5133333333333]
4 [1648340400000.0, 1053.8966666666668]
.. ...
283 [1648424100000.0, 966.1666666666666]
284 [1648424400000.0, 991.21]
285 [1648424700000.0, 977.6633333333333]
286 [1648425000000.0, 959.64]
287 [1648425300000.0, 961.6989966555184]
[288 rows x 1 columns]
As I pointed out in the comment referring to another answer, here is how you can do it.
columns = ['point 1', 'point 2']
result = response['series'][0]['pointlist']
df = pd.DataFrame(result, columns=columns)
If you instead want all the first elements of the sublists grouped together (and all the second elements grouped together), go through an intermediate np.array and transpose it:
import numpy as np

array_result = np.array(result)
df = pd.DataFrame(array_result.T)  # row 0: timestamps, row 1: values
I tested this on a few samples of 'result'.

How to optimize PRAW and pandas data collection to make it more pythonic?

I am using PRAW to get data from Reddit and created this function to do so on multiple subreddits.
It works; however, I am working on a more concise/Pythonic version and can't figure out how to create a single for loop that does the job of the three below.
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    posts = []
    followers = []
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S')
    # getting x new posts
    for subreddit in subs.new(limit=size_new):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x hot posts
    for subreddit in subs.hot(limit=size_hot):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x rising posts
    for subreddit in subs.rising(limit=size_rising):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting subreddit subscriber counts
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[['date', 'subreddit', 'subscribers', 'title', 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
My request: how could this be more concise/optimized? What methodology do you use to make your code more readable, or better yet, to optimize it for runtime/computational resources?
Thank you
There are various principles for writing better code, and various tools for finding the 'code smells' that may be lurking in yours:
DRY - Don't Repeat Yourself
KISS - Keep It Simple, Stupid
SOLID
etc.
Taking a dive into the code you posted and applying some of these principles at a surface level would refactor it into something like:
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# check that the date is greater than the target date
# return True/False
def check_date(subreddit, targeted_date):
    return subreddit.created >= targeted_date

# get specific post data
def get_post_data(subreddit):
    return [subreddit.created, subreddit.subreddit, subreddit.title, subreddit.selftext]

# get posts by sort type
def get_subreddit_post_types(subreddit_sort, targeted_date):
    return [get_post_data(subreddit) for subreddit in subreddit_sort if check_date(subreddit, targeted_date)]

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S').timestamp()
    posts = []
    followers = []
    # getting x new posts
    posts.extend(get_subreddit_post_types(subs.new(limit=size_new), targeted_date))
    # getting x hot posts
    posts.extend(get_subreddit_post_types(subs.hot(limit=size_hot), targeted_date))
    # getting x rising posts
    posts.extend(get_subreddit_post_types(subs.rising(limit=size_rising), targeted_date))
    # getting subreddit subscriber counts
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[['date', 'subreddit', 'subscribers', 'title', 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
As for better optimizing your computational resources (what are you trying to optimize, memory or runtime?), the same process applies to either: look at your code to see what can be changed to decrease one versus the other.
From looking at your code, one thing that would generally optimize it is to look at which 'duplicate' posts you are getting. New, hot, and rising pull posts from similar date ranges, and hot/rising may be completely encompassed inside of new, so you may be able to drop the drop_duplicates() call by de-duplicating as you collect (as sketched below), and possibly drop the hot/rising calls entirely.
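A rough sketch of that idea, de-duplicating by submission id while collecting instead of calling drop_duplicates() afterwards (this assumes PRAW submission objects expose an .id attribute):

seen_ids = set()

# drop-in replacement for get_subreddit_post_types above
def get_subreddit_post_types(subreddit_sort, targeted_date):
    rows = []
    for submission in subreddit_sort:
        if submission.id in seen_ids or not check_date(submission, targeted_date):
            continue
        seen_ids.add(submission.id)
        rows.append(get_post_data(submission))
    return rows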

Save geocoding results from address to longitude and latitude to original dataframe in Python

Given a small dataset df as follows:
id name address
0 1 ABC tower 北京市朝阳区
1 2 AC park 北京市海淀区
2 3 ZR hospital 上海市黄浦区
3 4 Fengtai library NaN
4 5 Square Point 上海市虹口区
I would like to obtain longitude and latitude for the address column and append them to the original dataframe. Please note there are NaNs in the address column.
The code below gives me a table with addresses, longitude and latitude, but it ignores the NaN address rows; the code itself could also be improved:
import pandas as pd
import requests
import json

df = df[df['address'].notna()]
res = []
for addre in df['address']:
    url = "http://restapi.amap.com/v3/geocode/geo?key=f057101329c0200f170be166d9b023a1&address=" + addre
    dat = {
        'count': "1",
    }
    r = requests.post(url, data=json.dumps(dat))
    s = r.json()
    infos = s['geocodes']
    for j in range(0, 10000):
        # print(j)
        try:
            more_infos = infos[j]
            # print(more_infos)
        except:
            continue
        try:
            data = more_infos['location']
            # print(data)
        except:
            continue
        try:
            lon_lat = data.split(',')
            lon = float(lon_lat[0])
            lat = float(lon_lat[1])
        except:
            continue
        res.append([addre, lon, lat])
result = pd.DataFrame(res)
result.columns = ['address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index=False)
Out:
address longitude latitude
0 北京市朝阳区 116.601144 39.948574
1 北京市海淀区 116.329519 39.972134
2 上海市黄浦区 121.469240 31.229860
3 上海市虹口区 121.505133 31.264600
But how could I get the final result as follows? Thanks for your kind help in advance.
id name address longitude latitude
0 1 ABC tower 北京市朝阳区 116.601144 39.948574
1 2 AC park 北京市海淀区 116.329519 39.972134
2 3 ZR hospital 上海市黄浦区 121.469240 31.229860
3 4 Fengtai library NaN NaN NaN
4 5 Square Point 上海市虹口区 121.505133 31.264600
Use pd.merge, since result is the longitude & latitude dataframe:
dfn = pd.merge(df, result, on='address', how='left')
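One caveat: in the question's code df is overwritten by df = df[df['address'].notna()], so keep the original frame intact and geocode only a filtered copy; because how='left' keeps every row of df, the 'Fengtai library' row then comes back with NaN longitude and latitude.

valid = df[df['address'].notna()]                       # loop over valid['address'] when geocoding
dfn = pd.merge(df, result, on='address', how='left')    # NaN-address rows survive with NaN lon/lat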
or
for _, row in df.iterrows():
    _id = row['id']
    name = row['name']
    addre = row['address']
    if pd.isna(row['address']):
        res.append([_id, name, addre, None, None])
        continue
    ###### same code ######
    url = '...'
    # ...
    ###### same code ######
    res.append([_id, name, addre, lon, lat])
result = pd.DataFrame(res)
result.columns = ['id', 'name', 'address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index=False)

How can I append new results from iterating through a list as new columns in the dataframe

I'm attempting to create a table as follows, where equities in a list get appended as columns to the dataframe:
Fundamentals CTRP EBAY ...... MPNGF
price
dividend
five_year_dividend
pe_ratio
pegRatio
priceToBook
price_to_sales
book_value
ebit
net_income
EPS
DebtEquity
threeYearAverageReturn
At the moment, based on the code below, only the last equity in the list is showing up:
Fundamentals MPNGF
price
dividend
five_year_dividend
pe_ratio
pegRatio
priceToBook
price_to_sales
book_value
ebit
net_income
EPS
DebtEquity
threeYearAverageReturn
from yahoofinancials import YahooFinancials
import pandas as pd
import lxml
from lxml import html
import requests
import numpy as np
from datetime import datetime

def scrape_table(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    table = tree.xpath('//table')
    assert len(table) == 1
    df = pd.read_html(lxml.etree.tostring(table[0], method='html'))[0]
    df = df.set_index(0)
    df = df.dropna()
    df = df.transpose()
    df = df.replace('-', '0')
    df[df.columns[0]] = pd.to_datetime(df[df.columns[0]])
    cols = list(df.columns)
    cols[0] = 'Date'
    df = df.set_axis(cols, axis='columns', inplace=False)
    numeric_columns = list(df.columns)[1::]
    df[numeric_columns] = df[numeric_columns].astype(np.float64)
    return df

ecommerce = ['CTRP', 'EBAY', 'GRUB', 'BABA', 'JD', 'EXPE', 'AMZN', 'BKNG', 'MPNGF']

price = []
dividend = []
five_year_dividend = []
pe_ratio = []
pegRatio = []
priceToBook = []
price_to_sales = []
book_value = []
ebit = []
net_income = []
EPS = []
DebtEquity = []
threeYearAverageReturn = []

for i, symbol in enumerate(ecommerce):
    yahoo_financials = YahooFinancials(symbol)
    balance_sheet_url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
    df_balance_sheet = scrape_table(balance_sheet_url)
    df_balance_sheet_de = pd.DataFrame(df_balance_sheet, columns=["Total Liabilities", "Total stockholders' equity"])
    j = df_balance_sheet_de.loc[[1]]
    j['DebtEquity'] = j["Total Liabilities"] / j["Total stockholders' equity"]
    k = j.iloc[0]['DebtEquity']
    X = yahoo_financials.get_key_statistics_data()
    for d in X.values():
        PEG = d['pegRatio']
        PB = d['priceToBook']
        three_year_ave_return = d['threeYearAverageReturn']
    data = [['price', yahoo_financials.get_current_price()], ['dividend', yahoo_financials.get_dividend_yield()], ['five_year_dividend', yahoo_financials.get_five_yr_avg_div_yield()], ['pe_ratio', yahoo_financials.get_pe_ratio()], ['pegRatio', PEG], ['priceToBook', PB], ['price_to_sales', yahoo_financials.get_price_to_sales()], ['book_value', yahoo_financials.get_book_value()], ['ebit', yahoo_financials.get_ebit()], ['net_income', yahoo_financials.get_net_income()], ['EPS', yahoo_financials.get_earnings_per_share()], ['DebtEquity', mee], ['threeYearAverageReturn', three_year_ave_return]]
    data.append(symbol.text)
    df = pd.DataFrame(data, columns=['Fundamentals', symbol])
df
Could you kindly advise where I may have gone wrong with the above table? Thank you so very much!
You need to build your df outside of your for loop. Your code as currently written recreates a new df on every iteration, so only the last symbol survives.
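A rough sketch of that pattern: collect one column of values per symbol inside the loop and build the frame once afterwards. fetch_fundamentals_for below is a hypothetical helper standing in for the per-symbol lookups already in the question.

import pandas as pd

fundamentals = ['price', 'dividend', 'five_year_dividend', 'pe_ratio', 'pegRatio',
                'priceToBook', 'price_to_sales', 'book_value', 'ebit', 'net_income',
                'EPS', 'DebtEquity', 'threeYearAverageReturn']

columns = {}
for symbol in ecommerce:
    # hypothetical helper: returns the 13 values computed by the question's
    # per-symbol code, in the same order as the fundamentals list above
    columns[symbol] = fetch_fundamentals_for(symbol)

# built once, outside the loop, so every symbol keeps its column
df = pd.DataFrame(columns, index=fundamentals)
df.index.name = 'Fundamentals'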

Multiple variables loop and append dataframe

I am trying to loop over 2 lists to get all possible combinations in the loop below. I have some difficulty understanding why the first part works and the second does not. Basically it queries the same data, but with every pattern from the lists. Any help would be much appreciated.
THE CODE:
base = ['BTC', 'ETH']
quoted = ['USDT', 'AUD', 'USD']

def daily_volume_historical(symbol, comparison_symbol, all_data=False, limit=90, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp')
    return df

## THIS CODE GIVES SOME DATA ##
volu = daily_volume_historical('BTC', 'USD', 'CCCAGG').set_index('timestamp').volumefrom

## THIS CODE GIVES EMPTY DATA FRAME ##
d_volu = []
for a, b in [(a, b) for a in base for b in quoted]:
    volu = daily_volume_historical(a, b, exchange='CCCAGG').volumefrom
    d_volu.append
d_volu = pd.concat(d_volu, axis=1)
volu output sample:
timestamp
2010-07-17 09:00:00 20.00
2010-07-18 09:00:00 75.01
2010-07-19 09:00:00 574.00
2010-07-20 09:00:00 262.00
2010-07-21 09:00:00 575.00
2010-07-22 09:00:00 2160.00
2010-07-23 09:00:00 2402.50
2010-07-24 09:00:00 496.32
import itertools
import datetime
import requests
import pandas as pd

base = ['BTC', 'ETH']
quoted = ['USDT', 'AUD', 'USD']
combinations = list(itertools.product(base, quoted))

def daily_volume_historical(symbol, comparison_symbol, all_data=False, limit=90, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp')
    return df

volu = daily_volume_historical('BTC', 'USD', 'CCCAGG').set_index('timestamp').volumefrom

d_volu = []
for a, b in combinations:
    volu = daily_volume_historical(a, b, exchange='CCCAGG').volumefrom
    d_volu.append(volu)  # the original loop referenced .append without calling it, so nothing was ever stored
d_volu = pd.concat(d_volu, axis=1)
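One optional refinement (my addition, not part of the original answer): every volumefrom Series carries the same name, so labelling the concatenated columns with the pair they came from makes the result easier to read.

d_volu = pd.concat(d_volu, axis=1, keys=['{}/{}'.format(a, b) for a, b in combinations])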
