Convert the results of a dictionary to a dataframe - Python
Given these commands:
from stackapi import StackAPI
import pandas as pd

lst = ['11786778', '12370060']
df = pd.DataFrame(lst)
SITE = StackAPI('stackoverflow', key="xxxx")
results = []
for i in range(1, len(df)):
    SITE.max_pages = 10000000
    SITE.page_size = 100
    post = SITE.fetch('/users/{ids}/reputation-history', ids=lst[i])
    results.append(post)
The results variable holds the responses in JSON format.
How can I convert the results variable to a dataframe with these five columns?
reputation_history_type, reputation_change, post_id, creation_date, user_id
Here, try this:
from stackapi import StackAPI
import pandas as pd

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow')
results = []
SITE.max_pages = 10000000
SITE.page_size = 100
for i in lst:
    post = SITE.fetch('/users/{ids}/reputation-history', ids=[i]).get('items')
    results.extend([list(j.values()) for j in post])
df = pd.DataFrame(results, columns=['reputation_history_type', 'reputation_change',
                                    'post_id', 'creation_date', 'user_id'])
Output:
print(df.head()) gives:
reputation_history_type reputation_change post_id creation_date user_id
0 asker_accepts_answer 2 59126012 1575207944 11786778.0
1 post_undownvoted 2 59118819 1575139301 11786778.0
2 post_upvoted 10 59118819 1575139301 11786778.0
3 post_downvoted -2 59118819 1575139299 11786778.0
4 post_upvoted 10 59110166 1575094452 11786778.0
print(df.tail()) gives:
reputation_history_type reputation_change post_id creation_date user_id
170 post_upvoted 10 58906292 1574036540 12370060.0
171 answer_accepted 15 58896536 1573990105 12370060.0
172 post_upvoted 10 58896044 1573972834 12370060.0
173 post_downvoted 0 58896299 1573948372 12370060.0
174 post_downvoted 0 58896158 1573947435 12370060.0
NOTE:
You can create the dataframe directly from results, which is a list of lists.
You don't need to set SITE.max_pages and SITE.page_size every time you loop through lst; setting them once before the loop is enough.
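As a quick sketch of that note (my addition, assuming StackAPI's fetch accepts a list of ids and joins them into the endpoint), the frame can even be built straight from the returned item dicts without converting them to lists first:

from stackapi import StackAPI
import pandas as pd

# Sketch only: assumes fetch() accepts a list of ids and that each item
# in 'items' carries the five fields used above.
SITE = StackAPI('stackoverflow')
SITE.max_pages = 10000000
SITE.page_size = 100
items = SITE.fetch('/users/{ids}/reputation-history', ids=['11786778', '12370060'])['items']
df = pd.DataFrame(items)[['reputation_history_type', 'reputation_change',
                          'post_id', 'creation_date', 'user_id']]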
from stackapi import StackAPI
import pandas as pd

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow', key="xxxx")
SITE.max_pages = 10000000
SITE.page_size = 100

results = []
for user_id in lst:
    post = SITE.fetch('/users/{ids}/reputation-history', ids=[user_id])
    results.append(post)

data = []
for item in results:
    data.extend(item['items'])  # flatten each response down to its list of records

df = pd.DataFrame(data, columns=['reputation_history_type', 'reputation_change',
                                 'post_id', 'creation_date', 'user_id'])
print(df)
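As an optional follow-up (my own note, not part of the answers above): creation_date comes back as a Unix timestamp in seconds, so it can be made readable with pd.to_datetime.

# Convert the epoch seconds to datetimes for readability.
df['creation_date'] = pd.to_datetime(df['creation_date'], unit='s')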
Kinda flying blind here since I maxed out my Stack Overflow API limit, but this should work:
from stackapi import StackAPI
import pandas as pd
from pandas.io.json import json_normalize

lst = ['11786778', '12370060']
SITE = StackAPI('stackoverflow', key="xxx")
results = []
for ids in lst:
    SITE.max_pages = 10000000
    SITE.page_size = 100
    post = SITE.fetch('/users/{ids}/reputation-history', ids=ids)
    results.append(json_normalize(post, 'items'))
df = pd.concat(results, ignore_index=True)
json_normalize converts the JSON records under 'items' into a dataframe.
pd.concat concatenates the per-user dataframes into a single frame.
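For illustration (a self-contained sketch using one record copied from the output above rather than a real API call), json_normalize flattens the dicts under 'items' into rows:

from pandas.io.json import json_normalize  # newer pandas also exposes pd.json_normalize

# One representative payload, shaped like the API response but hard-coded here.
sample = {'items': [{'reputation_history_type': 'post_upvoted',
                     'reputation_change': 10,
                     'post_id': 59118819,
                     'creation_date': 1575139301,
                     'user_id': 11786778}]}
print(json_normalize(sample, 'items'))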
Related
Cannot convert list of lists using pandas
This is my data from the API; from the looks of it, it is a list of lists.

with ApiClient(configuration) as api_client:
    api_instance = MetricsApi(api_client)
    response = api_instance.query_metrics(
        _from=int(yesterday_start_dt.timestamp()),
        to=int(yesterday_end_dt.timestamp()),
        query="default_zero(sum:trace.servlet.request.hits{env:prd-main,service:api}.as_rate())",
    )
    result = response['series'][0]['pointlist']
    print(result)

[[1648339200000.0, 1105.8433333333332], [1648339500000.0, 1093.3266666666666], [1648339800000.0, 1076.92], [1648340100000.0, 1059.5133333333333], [1648340400000.0, 1053.8966666666668], ..., [1648425000000.0, 959.64], [1648425300000.0, 961.6989966555184]]

(288 timestamp/value pairs in total.)

But when I try to convert it with pandas, it converts, but the result is not what I expected: I expected the dataframe to have two columns.

with ApiClient(configuration) as api_client:
    api_instance = MetricsApi(api_client)
    response = api_instance.query_metrics(
        _from=int(yesterday_start_dt.timestamp()),
        to=int(yesterday_end_dt.timestamp()),
        query="default_zero(sum:trace.servlet.request.hits{env:prd-main,service:api}.as_rate())",
    )
    result = response['series'][0]['pointlist']
    df = pd.DataFrame(result)
    print(df)

                                         0
0    [1648339200000.0, 1105.8433333333332]
1    [1648339500000.0, 1093.3266666666666]
2               [1648339800000.0, 1076.92]
3    [1648340100000.0, 1059.5133333333333]
4    [1648340400000.0, 1053.8966666666668]
..                                     ...
283   [1648424100000.0, 966.1666666666666]
284              [1648424400000.0, 991.21]
285   [1648424700000.0, 977.6633333333333]
286              [1648425000000.0, 959.64]
287   [1648425300000.0, 961.6989966555184]

[288 rows x 1 columns]
As I pointed out in the comment referring to another answer, here is how you can do it:

columns = ['point 1', 'point 2']
result = response['series'][0]['pointlist']
df = pd.DataFrame(result, columns=columns)
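As a possible follow-up (my own sketch, not part of the answer; it assumes the two-column frame and column names built above), the first column is an epoch in milliseconds and can be turned into a proper timestamp index:

# 'point 1' holds epoch milliseconds; convert it and use it as the index.
df['point 1'] = pd.to_datetime(df['point 1'], unit='ms')
df = df.set_index('point 1')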
If you want the first elements of the sublists to be in one column, I suggest you create an intermediate np.array and then reshape it into the needed output:

import numpy as np

array_result = np.array(result)
df = pd.DataFrame(array_result.reshape(len(result), 2))

Following a few samples of 'result', here are my outputs.
How to optimize PRAW and pandas data collection to make it more pythonic?
I am using PRAW to get data from Reddit and created this function to do so on multiple subreddits. It works; however, I am working on a more concise/Pythonic version and can't figure out how to create a single for loop that does the job of the three below.

subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

#This function aims to scrape data from a list of subreddits.
#From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    posts = []
    followers = []
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S')

    #getting x new posts
    for subreddit in subs.new(limit = size_new):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])

    #getting x hot posts
    for subreddit in subs.hot(limit = size_hot):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])

    #getting x rising posts
    for subreddit in subs.rising(limit = size_rising):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])

    #getting subreddit subscribers number
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit = 1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])

    #creating 2 df
    df_1 = pd.DataFrame(followers, columns = ['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns = ['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by = ['date']).reset_index(drop = True)

    #concat the 2 df together
    df = df.join(df_1.set_index('subreddit'), on = 'subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop = True)
    return df

My request: how could it be more concise/optimized? What methodology do you use to make your code more readable or, even better, to optimize it for run time/computational resources? Thank you.
There are various principles for writing better code, and various tools you can use to find the 'code smells' that may be lurking in your code:

DRY - Don't Repeat Yourself
KISS - Keep It Stupid Simple
SOLID
etc...

Taking a dive into the code that you posted and applying some of these principles at a surface level would refactor some of your code into looking like this:

subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# check that the date is greater than the target date
# return true/false
def check_date(subreddit, targeted_date):
    return subreddit.created >= targeted_date

# get specific post data
def get_post_data(subreddit):
    return [subreddit.created, subreddit.subreddit, subreddit.title, subreddit.selftext]

# get posts by sort type
def get_subreddit_post_types(subreddit_sort, targeted_date):
    return [get_post_data(subreddit) for subreddit in subreddit_sort if check_date(subreddit, targeted_date)]

#This function aims to scrape data from a list of subreddits.
#From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S').timestamp()
    posts = []
    followers = []

    #getting x new posts
    posts.extend(get_subreddit_post_types(subs.new(limit = size_new), targeted_date))
    #getting x hot posts
    posts.extend(get_subreddit_post_types(subs.hot(limit = size_hot), targeted_date))
    #getting x rising posts
    posts.extend(get_subreddit_post_types(subs.rising(limit = size_rising), targeted_date))

    #getting subreddit subscribers number
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit = 1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])

    #creating 2 df
    df_1 = pd.DataFrame(followers, columns = ['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns = ['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by = ['date']).reset_index(drop = True)

    #concat the 2 df together
    df = df.join(df_1.set_index('subreddit'), on = 'subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop = True)
    return df

As for better optimizing your computational resources: what are you trying to optimize, memory or runtime? The same process applies to either: look at your code to see what can be changed to decrease one versus the other. From looking at your code, something that would generally help is to examine which 'duplicate' posts you are getting. Each of the hot/rising/new calls fetches posts from similar date ranges, and hot/rising may be completely encompassed inside of new, so you may be able to remove the duplicate check from the posts that you gathered (and possibly the hot/rising calls altogether).
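Since the question specifically asked how to collapse the three loops into one, here is a minimal sketch of that idea (my addition, reusing the helpers defined above and PRAW's new/hot/rising listing methods):

# Inside get_data: pair each listing method with its limit and loop once.
listings = [(subs.new, size_new), (subs.hot, size_hot), (subs.rising, size_rising)]
for listing, limit in listings:
    posts.extend(get_subreddit_post_types(listing(limit=limit), targeted_date))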
Save geocoding results from address to longitude and latitude to original dataframe in Python
Given a small dataset df as follows:

   id             name  address
0   1        ABC tower  北京市朝阳区
1   2          AC park  北京市海淀区
2   3      ZR hospital  上海市黄浦区
3   4  Fengtai library      NaN
4   5     Square Point  上海市虹口区

I would like to obtain longitude and latitude for the address column and append them to the original dataframe. Please note there are NaNs in the address column.

The code below gives me a table with addresses, longitude and latitude, but it ignores the NaN address rows; the code should also be improved:

import pandas as pd
import requests
import json

df = df[df['address'].notna()]
res = []
for addre in df['address']:
    url = "http://restapi.amap.com/v3/geocode/geo?key=f057101329c0200f170be166d9b023a1&address=" + addre
    dat = {
        'count': "1",
    }
    r = requests.post(url, data = json.dumps(dat))
    s = r.json()
    infos = s['geocodes']
    for j in range(0, 10000):
        # print(j)
        try:
            more_infos = infos[j]
            # print(more_infos)
        except:
            continue
        try:
            data = more_infos['location']
            # print(data)
        except:
            continue
        try:
            lon_lat = data.split(',')
            lon = float(lon_lat[0])
            lat = float(lon_lat[1])
        except:
            continue
        res.append([addre, lon, lat])
result = pd.DataFrame(res)
result.columns = ['address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index = False)

Out:

  address   longitude   latitude
0  北京市朝阳区  116.601144  39.948574
1  北京市海淀区  116.329519  39.972134
2  上海市黄浦区  121.469240  31.229860
3  上海市虹口区  121.505133  31.264600

But how could I get the final result as follows? Thanks for your kind help in advance.

   id             name  address   longitude   latitude
0   1        ABC tower  北京市朝阳区  116.601144  39.948574
1   2          AC park  北京市海淀区  116.329519  39.972134
2   3      ZR hospital  上海市黄浦区  121.469240  31.229860
3   4  Fengtai library      NaN         NaN        NaN
4   5     Square Point  上海市虹口区  121.505133  31.264600
Use pd.merge, since result is the longitude & latitude dataframe:

dfn = pd.merge(df, result, on='address', how='left')

or:

for _, row in df.iterrows():
    _id = row['id']
    name = row['name']
    addre = row['address']
    if pd.isna(row['address']):
        res.append([_id, name, addre, None, None])
        continue
    ###### same code ######
    url = '...'
    # ...
    ###### same code ######
    res.append([_id, name, addre, lon, lat])

result = pd.DataFrame(res)
result.columns = ['id', 'name', 'address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index = False)
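Another option to keep the NaN rows aligned (a sketch of my own, not part of the answer; geocode_address is a hypothetical helper wrapping the request from the question) is to apply a per-address function and assign the two new columns on the original dataframe:

import json
import pandas as pd
import requests

URL = "http://restapi.amap.com/v3/geocode/geo?key=f057101329c0200f170be166d9b023a1&address="

def geocode_address(addre):
    # Hypothetical helper based on the question's request; returns (lon, lat)
    # or (None, None) when the address is missing or cannot be resolved.
    if pd.isna(addre):
        return (None, None)
    r = requests.post(URL + addre, data=json.dumps({'count': "1"}))
    geocodes = r.json().get('geocodes') or []
    if not geocodes or 'location' not in geocodes[0]:
        return (None, None)
    lon, lat = geocodes[0]['location'].split(',')
    return (float(lon), float(lat))

coords = df['address'].apply(geocode_address)   # one (lon, lat) tuple per row
df['longitude'] = [c[0] for c in coords]
df['latitude'] = [c[1] for c in coords]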
How can I append results from iterating through a list as new columns in the dataframe?
I'm attempting to create a table as follows, where equities in a list get appended as columns to the dataframe:

Fundamentals              CTRP  EBAY  ......  MPNGF
price
dividend
five_year_dividend
pe_ratio
pegRatio
priceToBook
price_to_sales
book_value
ebit
net_income
EPS
DebtEquity
threeYearAverageReturn

At the moment, based on the code below, only the last equity in the list is showing up:

Fundamentals              MPNGF
price
dividend
five_year_dividend
pe_ratio
pegRatio
priceToBook
price_to_sales
book_value
ebit
net_income
EPS
DebtEquity
threeYearAverageReturn

from yahoofinancials import YahooFinancials
import pandas as pd
import lxml
from lxml import html
import requests
import numpy as np
from datetime import datetime

def scrape_table(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    table = tree.xpath('//table')
    assert len(table) == 1
    df = pd.read_html(lxml.etree.tostring(table[0], method='html'))[0]
    df = df.set_index(0)
    df = df.dropna()
    df = df.transpose()
    df = df.replace('-', '0')
    df[df.columns[0]] = pd.to_datetime(df[df.columns[0]])
    cols = list(df.columns)
    cols[0] = 'Date'
    df = df.set_axis(cols, axis='columns', inplace=False)
    numeric_columns = list(df.columns)[1::]
    df[numeric_columns] = df[numeric_columns].astype(np.float64)
    return df

ecommerce = ['CTRP', 'EBAY', 'GRUB', 'BABA', 'JD', 'EXPE', 'AMZN', 'BKNG', 'MPNGF']

price=[]
dividend=[]
five_year_dividend=[]
pe_ratio=[]
pegRatio=[]
priceToBook=[]
price_to_sales=[]
book_value=[]
ebit=[]
net_income=[]
EPS=[]
DebtEquity=[]
threeYearAverageReturn=[]

for i, symbol in enumerate(ecommerce):
    yahoo_financials = YahooFinancials(symbol)
    balance_sheet_url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
    df_balance_sheet = scrape_table(balance_sheet_url)
    df_balance_sheet_de = pd.DataFrame(df_balance_sheet, columns = ["Total Liabilities", "Total stockholders' equity"])
    j = df_balance_sheet_de.loc[[1]]
    j['DebtEquity'] = j["Total Liabilities"]/j["Total stockholders' equity"]
    k = j.iloc[0]['DebtEquity']
    X = yahoo_financials.get_key_statistics_data()
    for d in X.values():
        PEG = d['pegRatio']
        PB = d['priceToBook']
        three_year_ave_return = d['threeYearAverageReturn']
    data = [['price', yahoo_financials.get_current_price()],
            ['dividend', yahoo_financials.get_dividend_yield()],
            ['five_year_dividend', yahoo_financials.get_five_yr_avg_div_yield()],
            ['pe_ratio', yahoo_financials.get_pe_ratio()],
            ['pegRatio', PEG],
            ['priceToBook', PB],
            ['price_to_sales', yahoo_financials.get_price_to_sales()],
            ['book_value', yahoo_financials.get_book_value()],
            ['ebit', yahoo_financials.get_ebit()],
            ['net_income', yahoo_financials.get_net_income()],
            ['EPS', yahoo_financials.get_earnings_per_share()],
            ['DebtEquity', mee],
            ['threeYearAverageReturn', three_year_ave_return]]
    data.append(symbol.text)
    df = pd.DataFrame(data, columns = ['Fundamentals', symbol])
df

Could you kindly advise where I may have gone wrong in the code above? Thank you so very much!
You need to build your df outside of your for loop. Your code as currently written recreates a new df on every iteration, so only the last symbol survives.
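A minimal sketch of that idea (my addition, reusing the YahooFinancials calls from the question; only a few fundamentals are shown for brevity): collect one column per symbol inside the loop and build the dataframe a single time afterwards.

from yahoofinancials import YahooFinancials
import pandas as pd

# Illustrative subset of the fundamentals; extend with the remaining rows as needed.
ecommerce = ['CTRP', 'EBAY', 'MPNGF']
columns = {}
for symbol in ecommerce:
    yf = YahooFinancials(symbol)
    columns[symbol] = {
        'price': yf.get_current_price(),
        'dividend': yf.get_dividend_yield(),
        'pe_ratio': yf.get_pe_ratio(),
        # ... add the other fundamentals here ...
    }

# One dataframe built after the loop: rows are fundamentals, columns are symbols.
df = pd.DataFrame(columns)
print(df)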
Looping over multiple variables and appending to a dataframe
I am trying to loop over 2 lists to get all possible combinations in the loop below. I have some difficulty understanding why the first part works and the second does not. Basically it queries the same data, but for every pair from the lists. Any help would be very appreciated.

THE CODE:

base = ['BTC', 'ETH']
quoted = ['USDT', 'AUD', 'USD']

def daily_volume_historical(symbol, comparison_symbol, all_data=False, limit=90, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp')
    return df

## THIS CODE GIVES SOME DATA ##
volu = daily_volume_historical('BTC', 'USD', 'CCCAGG').set_index('timestamp').volumefrom

## THIS CODE GIVES EMPTY DATA FRAME ##
d_volu = []
for a, b in [(a, b) for a in base for b in quoted]:
    volu = daily_volume_historical(a, b, exchange= 'CCCAGG').volumefrom
    d_volu.append

d_volu = pd.concat(d_volu, axis=1)

volu output sample:

timestamp
2010-07-17 09:00:00      20.00
2010-07-18 09:00:00      75.01
2010-07-19 09:00:00     574.00
2010-07-20 09:00:00     262.00
2010-07-21 09:00:00     575.00
2010-07-22 09:00:00    2160.00
2010-07-23 09:00:00    2402.50
2010-07-24 09:00:00     496.32
You can use itertools.product to build all the (base, quoted) pairs; also note that append must actually be called with the series, otherwise the list stays empty:

import itertools
import datetime
import requests
import pandas as pd

base = ['BTC', 'ETH']
quoted = ['USDT', 'AUD', 'USD']
combinations = list(itertools.product(base, quoted))

def daily_volume_historical(symbol, comparison_symbol, all_data=False, limit=90, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df.time]
    df.set_index('timestamp')
    return df

# single pair (works)
volu = daily_volume_historical('BTC', 'USD', 'CCCAGG').set_index('timestamp').volumefrom

# all pairs
d_volu = []
for a, b in combinations:
    volu = daily_volume_historical(a, b, exchange='CCCAGG').volumefrom
    d_volu.append(volu)   # append was missing its argument in the original loop

d_volu = pd.concat(d_volu, axis=1)
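As a small follow-up (my own suggestion, not part of the answer above): the final concat can also pass keys=, so each resulting column is labelled with the (base, quoted) pair it came from.

# Alternative final line: label each column with its (base, quoted) pair.
d_volu = pd.concat(d_volu, axis=1, keys=combinations)
print(d_volu.head())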