The following is a Python script that prints live data from a data feed vendor's API. I want the data in a pandas DataFrame, but it prints only the following result:
"Empty DataFrame
Columns: []
Index: []"
from truedata_ws.websocket.TD import TD
import time
import logging
import pandas as pd

username = ''
password = ''
realtime_port = 8084
url = 'push.truedata.in'
symbols = []

td_obj = TD(username, password, live_port=realtime_port, url=url, log_level=logging.DEBUG, log_format="%(message)s")
print('\nStarting Real Time Feed.... ')
req_ids = td_obj.start_live_data(symbols)
live_data_objs = {}
time.sleep(1)

for req_id in req_ids:
    print(f'touchlinedata -> {td_obj.touchline_data[req_id]}')

df = pd.DataFrame(live_data_objs)
print(df)

# td_obj.trade_callback
def strategy_callback(symbol_id, tick_data):
    print(f'Trade update > {tick_data}')

while True:
    time.sleep(120)
In your code you pass an empty dictionary as the argument when creating the DataFrame (live_data_objs = {} is never filled), and the DataFrame you get back from an empty dictionary will always be empty. You need to populate live_data_objs with the incoming ticks before constructing the DataFrame.
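A minimal sketch of that idea, assuming the callback can be registered through td_obj.trade_callback as your commented-out line suggests (the exact attribute name and the structure of tick_data depend on the truedata_ws library, so treat this as illustrative only):

collected_ticks = []  # accumulate incoming ticks here

def strategy_callback(symbol_id, tick_data):
    # store each tick as a plain dict so pandas can build one row per tick
    collected_ticks.append({'symbol_id': symbol_id, 'tick': tick_data})

td_obj.trade_callback = strategy_callback  # assumed registration mechanism

time.sleep(10)  # let some ticks arrive first

df = pd.DataFrame(collected_ticks)  # no longer built from an empty dict
print(df)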
I'll list the two bugs I know of so far; if you have any recommendations for refactoring my code, please let me know.
Issue 1: yfinance is not appending the dividendYield to my dict, even though I made sure that there is an actual dividend yield for those symbols.
Issue 2: TypeError: can only concatenate str (not "Tag") to str, which I assume has to do with how BeautifulSoup parses the XML: it runs into a Tag object, so I am not able to create the expander. I thought I could solve it with the if statement below, but instead I just don't get any expander at all.
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
Full code for main.py:
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")


def extract_rss(rss_link):
    # Parses the XML feeds and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings


def stock_info(headings):
    # Get the entities from each heading, link them with the Nasdaq data
    # if possible, and extract market data with yfinance.
    stock_dict = {
        'Org': [],
        'Symbol': [],
        'currentPrice': [],
        'dayHigh': [],
        'dayLow': [],
        'forwardPE': [],
        'dividendYield': []
    }
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    stock_info = yf.Ticker(symbol).info
                    print(symbol)
                    stock_dict['Org'].append(org_name)
                    stock_dict['Symbol'].append(symbol)
                    stock_dict['currentPrice'].append(
                        stock_info['currentPrice'])
                    stock_dict['dayHigh'].append(stock_info['dayHigh'])
                    stock_dict['dayLow'].append(stock_info['dayLow'])
                    stock_dict['forwardPE'].append(stock_info['forwardPE'])
                    stock_dict['dividendYield'].append(
                        stock_info['dividendYield'])
                else:
                    # If the name can't be found, pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame.from_dict(stock_dict, orient='index')
    output_df = output_df.transpose()
    return output_df


# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
print(fin_headings)
# Output financial info
output_df = stock_info(fin_headings)
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
There is an issue in the logic of your stock_info function because of which the same symbol ends up with different values, and when you clean the duplicates based on the symbol, only the row with the first occurrence of each symbol is retained.
The below code will solve both of your issues.
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")


def extract_rss(rss_link):
    # Parses the XML feeds and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings


def stock_info(headings):
    stock_info_list = []
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    print(symbol)
                    stock_info = yf.Ticker(symbol).info
                    stock_info['Org'] = org_name
                    stock_info['Symbol'] = symbol
                    stock_info_list.append(stock_info)
                else:
                    # If the name can't be found, pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame(stock_info_list)
    return output_df


# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
output_df = stock_info(fin_headings)
output_df = output_df[['Org', 'Symbol', 'currentPrice', 'dayHigh', 'dayLow', 'forwardPE', 'dividendYield']]
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        heading = heading.text
        if type(heading) == str:
            st.markdown("* " + heading)
        else:
            pass
For issue #2, the patch code that you posted has a small mistake. Rather than checking if heading == str, which does something completely different from what you intended and will always be False, you want to check isinstance(heading, str). That way you get True if heading is a string and False if not. However, even that is not the fix here, because heading is not a string to begin with; it is a parsed Tag object. Instead you want to call get_text on heading to get the actual text part of the parsed object.
heading.get_text()
More information would be needed to solve issue #1. What does stock_dict look like just before you create the DataFrame out of it? Specifically, what values are in stock_dict['dividendYield']? Can you print it and add it to your question?
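For example, a quick check like this right before you build the DataFrame (it only uses the names from your own stock_dict) would show whether the lists have drifted out of sync:

# debug: every list in stock_dict should have the same length
for key, values in stock_dict.items():
    print(key, len(values))
print(stock_dict['dividendYield'])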
Also, about the refactoring part: an
else:
    pass
block does absolutely nothing and should be deleted (when the if condition is false, nothing happens anyway).
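Putting both points together, the expander block could look roughly like this (a sketch only; it assumes fin_headings is the list of Tag objects returned by extract_rss):

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        # get_text() returns the plain string inside the parsed <title> tag
        st.markdown("* " + heading.get_text())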
I am extracting Reddit data via the Pushshift API. More precisely, I am interested in comments and posts (submissions) in subreddit X with search word Y, made from now back to datetime Z (e.g. all comments mentioning "GME" in the subreddit r/wallstreetbets). All these parameters can be specified. So far, I have it working with the following code:
import pandas as pd
import requests
from datetime import datetime
import traceback
import time
import json
import sys
import numpy as np

username = ""  # put the username you want to download in the quotes
subreddit = "gme"  # put the subreddit you want to download in the quotes
search_query = "gamestop"  # put the word you want to search for (present in comment or post) in the quotes
# leave either one blank to download an entire user's, subreddit's, or search word's history
# or fill in all to download a specific user's history from a specific subreddit mentioning a specific word

filter_string = None
if username == "" and subreddit == "" and search_query == "":
    print("Fill in either username or subreddit")
    sys.exit(0)
elif username == "" and subreddit != "" and search_query == "":
    filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "" and search_query == "":
    filter_string = f"author={username}"
elif username == "" and subreddit != "" and search_query != "":
    filter_string = f"subreddit={subreddit}&q={search_query}"
elif username == "" and subreddit == "" and search_query != "":
    filter_string = f"q={search_query}"
else:
    filter_string = f"author={username}&subreddit={subreddit}&q={search_query}"

url = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before="

start_time = datetime.utcnow()


def redditAPI(object_type):
    global df_comments
    df_comments = pd.DataFrame(columns=["date", "comment", "score", "id"])
    global df_posts
    df_posts = pd.DataFrame(columns=["date", "post", "score", "id"])

    print(f"\nLooping through {object_type}s and appending to dataframe...")
    count = 0
    previous_epoch = int(start_time.timestamp())
    while True:
        # Ensures that the loop breaks at March 16 2021 for testing purposes
        if previous_epoch <= 1615849200:
            break

        new_url = url.format(object_type, filter_string) + str(previous_epoch)
        json_text = requests.get(new_url)
        time.sleep(1)  # pushshift has a rate limit; if we send requests too fast it will start returning error messages
        try:
            json_data = json.loads(json_text.text)
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break

        df2 = pd.DataFrame.from_dict(objects)

        for object in objects:
            previous_epoch = object['created_utc'] - 1
            count += 1

        if object_type == "comment":
            df2.rename(columns={'created_utc': 'date', 'body': 'comment'}, inplace=True)
            df_comments = df_comments.append(df2[['date', 'comment', 'score']])
        elif object_type == "submission":
            df2.rename(columns={'created_utc': 'date', 'selftext': 'post'}, inplace=True)
            df_posts = df_posts.append(df2[['date', 'post', 'score']])

    # Convert UNIX to datetime
    df_comments["date"] = pd.to_datetime(df_comments["date"], unit='s')
    df_posts["date"] = pd.to_datetime(df_posts["date"], unit='s')

    # Drop blank rows (the case when posts consist only of an image)
    df_posts['post'].replace('', np.nan, inplace=True)
    df_posts.dropna(subset=['post'], inplace=True)

    # Drop duplicates (see last comment on https://www.reddit.com/r/pushshift/comments/b7onr6/max_number_of_results_returned_per_query/)
    df_comments = df_comments.drop_duplicates()
    df_posts = df_posts.drop_duplicates()

    print("\nDone. Saved to dataframe.")
Unfortunately, I have some performance issues. Because I paginate based on created_utc - 1 (and since I do not want to miss any comments/posts), the initial dataframe will contain duplicates (there won't be 100 new comments/posts, the API limit, every single second). If I run the code for a long time frame (e.g. current time back to 1 March 2021), this results in a huge dataframe which takes considerably long to process.
As the code stands, the duplicates are added to the dataframe and only removed after the loop. Is there a way to make this more efficient, e.g. to check within the for loop whether the object already exists in the dataframe? Would this make a difference performance-wise? Any input would be very much appreciated.
It is possible to query the data so that there are no duplicates in the first place.
You are using the before parameter of the API, which returns only records strictly before the given timestamp. This means we can pass as before, on each iteration, the timestamp of the earliest record that we already have. In that case the response will only contain records that we haven't seen yet, so no duplicates.
In code that would look something like this:
import pandas as pd
import requests
import urllib.parse
import time
import json


def get_data(object_type, username='', subreddit='', search_query='', max_time=None, min_time=1615849200):
    # start from the current time if not specified
    if max_time is None:
        max_time = int(time.time())
    # generate the filter string
    filter_string = urllib.parse.urlencode(
        {k: v for k, v in zip(
            ['author', 'subreddit', 'q'],
            [username, subreddit, search_query]) if v != ""})
    url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"

    before = max_time
    df = pd.DataFrame()

    while before > min_time:
        url = url_format.format(object_type, filter_string, before)
        resp = requests.get(url)

        # convert records to a dataframe
        dfi = pd.json_normalize(json.loads(resp.text)['data'])

        if object_type == 'comment':
            dfi = dfi.rename(columns={'created_utc': 'date', 'body': 'comment'})
            df = pd.concat([df, dfi[['id', 'date', 'comment', 'score']]])
        elif object_type == 'submission':
            dfi = dfi.rename(columns={'created_utc': 'date', 'selftext': 'post'})
            dfi = dfi[dfi['post'].ne('')]
            df = pd.concat([df, dfi[['id', 'date', 'post', 'score']]])

        # set `before` to the earliest comment/post in the results;
        # the next time we call requests.get(...) we will only get comments/posts
        # before the earliest that we already have, thus not fetching any duplicates
        before = dfi['date'].min()

        # if needed
        # time.sleep(1)

    return df
Testing by getting the comments and checking for duplicate values (by id):
username = ""
subreddit = "gme"
search_query = "gamestop"

df_comments = get_data(
    object_type='comment',
    username=username,
    subreddit=subreddit,
    search_query=search_query)

df_comments['id'].duplicated().any()  # False
df_comments['id'].nunique()  # 2200
I would suggest a Bloom filter to check whether values have already been seen.
There is a package on PyPI that implements this very easily. To use the Bloom filter you just add a "key" to the filter; this can be, for example, a combination of the username and the comment. That way you can check whether you have already added a comment to your data frame. I suggest applying the Bloom filter as early as possible in your method, i.e. right after you get a response from the API.
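A rough sketch of that idea, assuming the bloom-filter package from PyPI (the package name and constructor arguments are assumptions; check the docs of whichever implementation you pick) and applying the check right after the API response has been parsed:

from bloom_filter import BloomFilter  # pip install bloom-filter (assumed package)

seen = BloomFilter(max_elements=1_000_000, error_rate=0.001)

def keep_new(objects):
    # keep only records whose key has not passed through the filter yet
    fresh = []
    for obj in objects:
        key = obj['id']  # a comment/post id; author + body would also work as a key
        if key not in seen:
            seen.add(key)
            fresh.append(obj)
    return fresh

Filtering json_data['data'] through keep_new() before building df2 keeps the duplicates out of the dataframe entirely.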
I am creating a web scraping program using Python, BeautifulSoup, pandas and Google Sheets.
Up until now I have managed to scrape data tables from URLs which I get from a list in Google Sheets, and I have created dataframes for each dataset. In my list of URLs, some of the cells in the column are empty, which gives me the following error when I try to import the dataframes into another sheet:
MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
What I'd like to achieve is that for every empty cell in the URL sheet, an empty dataframe is created, just like the ones with data inside them. Is that possible?
My code so far looks like this:
import gspread
from df2gspread import df2gspread as d2g
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests

credentials = service_account.Credentials.from_service_account_file(
    'credentials.json')
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds',
     'https://www.googleapis.com/auth/drive']
)
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
spreadsheet_key = gc.open_by_key('api_key')

# Data import
data_worksheet = spreadsheet_key.worksheet("Data")

# Urls
url_worksheet = spreadsheet_key.worksheet("Urls")
link_list = url_worksheet.col_values(2)


def get_info(linkIndex):
    page = requests.get(link_list[linkIndex])
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        tbl = soup.find('table')
        labels = []
        results = []
        for tr in tbl.findAll('tr'):
            headers = [th.text.strip() for th in tr.findAll('th')]
            data = [td.text.strip() for td in tr.findAll('td')]
            labels.append(headers)
            results.append(data)
        final_results = []
        for final_labels, final_data in zip(labels, results):
            final_results.append({'Labels': final_labels, 'Data': final_data})
        df = pd.DataFrame(final_results)
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        indexNames = df[df['Labels'] == 'Links'].index
        df.drop(indexNames, inplace=True)
        set_with_dataframe(data_worksheet, df, col=(linkIndex*6)+1, row=2,
                           include_column_header=False)[1:]
    except Exception as e:
        print(e)


for linkInd in range(len(link_list))[1:]:
    get_info(linkInd)
It depends on what you mean by an empty dataframe. If it is a dataframe containing no data at all, it can be created with the statement pd.DataFrame(). If it is a dataframe containing np.NaN / None values in the same columns as the other dataframes, it can be created from a dict:
import numpy as np
import pandas as pd

# x is the number of rows in the dataframe
d = {
    'column1': [np.NaN] * x,
    'column2': [np.NaN] * x,
    'column3': [np.NaN] * x
}
df = pd.DataFrame(d)
At the beginning of the get_info() function a check should be added:
if link_list[linkIndex] is not None:  # or if link_list[linkIndex] != '' (depending on the format of an empty cell)
The already existing logic goes in the if branch, and the empty dataframe is created in the else branch. set_with_dataframe() should be called after the if/else statement, because it is executed in both cases.
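Putting that together, get_info() could be restructured roughly like this (a sketch only: it reuses the module-level objects from your script, such as link_list and data_worksheet, keeps your existing parsing logic in the if branch, and uses placeholder NaN columns whose names you would adjust to your real tables):

def get_info(linkIndex):
    if link_list[linkIndex]:  # non-empty cell: scrape as before
        page = requests.get(link_list[linkIndex])
        soup = BeautifulSoup(page.content, 'html.parser')
        tbl = soup.find('table')
        rows = [{'Labels': [th.text.strip() for th in tr.findAll('th')],
                 'Data': [td.text.strip() for td in tr.findAll('td')]}
                for tr in tbl.findAll('tr')]
        df = pd.DataFrame(rows)
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        df.drop(df[df['Labels'] == 'Links'].index, inplace=True)
    else:  # empty cell: build a placeholder frame of the same shape
        df = pd.DataFrame({'Labels': [np.NaN], 'Data': [np.NaN]})
    # writing to the sheet happens in both cases, so it sits after the if/else
    set_with_dataframe(data_worksheet, df, col=(linkIndex * 6) + 1, row=2,
                       include_column_header=False)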
I know that pandas has the read_json function to effectively get data from an API into a dataframe. But is there any way to actually read through all the pages of the API and put them into the same dataframe?
import requests
import pandas as pd
import config

api_key = config.api_key
url = "http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=" + api_key
payload = "{}"
response = requests.request("GET", url, data=payload)
print(response.text.encode("utf-8"))
I tried the requests method above, but it only gives me the first page of the API. I wanted to see if there is any way I can do it with the dataframe method below, but I don't understand how to write a loop that goes over all the pages and puts everything into one dataframe for further analysis.
df = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
You can read each page into a dataframe and concatenate them:
page = 0
df = []
while True:
    try:
        next_page = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
        # didn't get any content, stop
        if len(next_page) == 0:
            break
        else:
            # move on to the next page
            df.append(next_page)
            page += 1
    except:
        # if we get an error from the API call, maybe the URL for that page doesn't exist,
        # so stop
        break

df = pd.concat(df, axis=0)
See the documentation for pd.concat for more details. Hope it helps :)
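If the endpoint follows the usual TMDB response shape, you can also stop by reading the total_pages field rather than relying on an exception. A sketch under that assumption (the fields page, total_pages and results are taken from TMDB's documented discover response, not from your code, so double-check them):

import requests
import pandas as pd

api_key = ""  # fill in your key
base = ("http://api.themoviedb.org/3/discover/movie"
        "?release_date.gte=2017-12-01&release_date.lte=2017-12-31"
        "&api_key={key}&page={page}")

frames = []
page = 1
total_pages = 1
while page <= total_pages:
    payload = requests.get(base.format(key=api_key, page=page)).json()
    total_pages = payload.get("total_pages", 1)  # reported by the API on every page
    frames.append(pd.json_normalize(payload.get("results", [])))
    page += 1

df = pd.concat(frames, ignore_index=True)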
I am trying to get the values of some variables and insert them into a pandas DataFrame. I am getting data from the Google Merchant API and I want to write it into a pandas DataFrame.
I execute calls to the API and then use while ... continue to iterate over the loop and take the next page's results.
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import httplib2
import pandas as pd


def get_products(merchant_id):
    request = service.products().list(merchantId=merchant_id,
                                      maxResults=250)
    while request is not None:
        result = request.execute()
        for product in result['resources']:
            product_id = product['id']
            availability = product['availability']
            category = product['productType'].split('>')
            category = category[0]
            return product_id, availability, category
        request = service.products().list_next(request, result)
        continue


if __name__ == '__main__':
    result = get_products('yyyy')
    print result
    data = {'product_id': result[0], 'availability': result[1], 'category': result[2]}
    products_df = pd.DataFrame(data, index=[0])
    print products_df
However, in my output DataFrame I only get one line for some reason:
(u'online:CH:480089', u'in stock', u'Spielzeug ')
When I test inside the function itself I get thousands of rows.
Do you see something wrong in my logic?
Stripping out all of the other statements and keeping just the important bits:
def get_products(merchant_id):
    while request is not None:
        for product in result['resources']:
            return product_id, availability, category  # The first product will be returned by the function
        continue  # essentially a NOP
You loop through the request and return the first product you find. You need to aggregate the results and then return that aggregation:
def get_products(merchant_id):
    products = []  # aggregate every product here instead of returning early
    while request is not None:
        for product in result['resources']:
            products.append((product_id, availability, category))
        request = service.products().list_next(request, result)
    return products  # return only after every page has been processed
which will return a list of tuples, one per product... although take note that your data = {} construction will not work with this format.
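For example, the list of tuples can be turned into a DataFrame by naming the columns directly (a small sketch; the Python 2 print statement matches the rest of your script):

if __name__ == '__main__':
    products = get_products('yyyy')
    products_df = pd.DataFrame(products,
                               columns=['product_id', 'availability', 'category'])
    print products_df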