I'll just list the two bugs I know as of now, and if you have any recommendations for refactoring my code let me know I'll go ahead and list out the few known issues as of now.
yfinance is not appending the dividendYield to my dict, I did make sure that their is an actual Dividend Yield for those Symbols.
TypeError: can only concatenate str (not "Tag") to str which I assume is something to do with how it parsing through the xml, and it ran into a tag so I am not able to create the expander, I thought I could solve it with this if statement, but instead I just don't get any expander at all.
with st.expander("Expand for stocks news"):
for heading in fin_headings:
if heading == str:
st.markdown("* " + heading)
else:
pass
Full code for main.py:
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup
st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")
def extract_rss(rss_link):
# Parses xml, and extracts the headings.
headings = []
response1 = requests.get(
"http://feeds.marketwatch.com/marketwatch/marketpulse/")
response2 = requests.get(rss_link)
parse1 = BeautifulSoup(response1.content, features="xml")
parse2 = BeautifulSoup(response2.content, features="xml")
headings1 = parse1.findAll('title')
headings2 = parse2.findAll('title')
headings = headings1 + headings2
return headings
def stock_info(headings):
# Get the entities from each heading, link it with nasdaq data // if possible, and Extract market data with yfinance.
stock_dict = {
'Org': [],
'Symbol': [],
'currentPrice': [],
'dayHigh': [],
'dayLow': [],
'forwardPE': [],
'dividendYield': []
}
stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
for title in headings:
doc = nlp(title.text)
for ent in doc.ents:
try:
if stocks_df['Name'].str.contains(ent.text).sum():
symbol = stocks_df[stocks_df['Name'].str.contains(
ent.text)]['Symbol'].values[0]
org_name = stocks_df[stocks_df['Name'].str.contains(
ent.text)]['Name'].values[0]
# Recieve info from yfinance
stock_info = yf.Ticker(symbol).info
print(symbol)
stock_dict['Org'].append(org_name)
stock_dict['Symbol'].append(symbol)
stock_dict['currentPrice'].append(
stock_info['currentPrice'])
stock_dict['dayHigh'].append(stock_info['dayHigh'])
stock_dict['dayLow'].append(stock_info['dayLow'])
stock_dict['forwardPE'].append(stock_info['forwardPE'])
stock_dict['dividendYield'].append(
stock_info['dividendYield'])
else:
# If name can't be found pass.
pass
except:
# Don't raise an error.
pass
output_df = pd.DataFrame.from_dict(stock_dict, orient='index')
output_df = output_df.transpose()
return output_df
# Add input field input field
user_input = st.text_input(
"Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
print(fin_headings)
# Output financial info
output_df = stock_info(fin_headings)
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
for heading in fin_headings:
if heading == str:
st.markdown("* " + heading)
else:
pass
There is an issue in your logic in stock_info function because of which same symbol is getting different values and when you are cleaning the duplicate, based on occurrence of the symbol its retaining the row with first occurrence of symbol.
The below code will solve both of your issues.
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup
st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")
def extract_rss(rss_link):
# Parses xml, and extracts the headings.
headings = []
response1 = requests.get(
"http://feeds.marketwatch.com/marketwatch/marketpulse/")
response2 = requests.get(rss_link)
parse1 = BeautifulSoup(response1.content, features="xml")
parse2 = BeautifulSoup(response2.content, features="xml")
headings1 = parse1.findAll('title')
headings2 = parse2.findAll('title')
headings = headings1 + headings2
return headings
def stock_info(headings):
stock_info_list = []
stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
for title in headings:
doc = nlp(title.text)
for ent in doc.ents:
try:
if stocks_df['Name'].str.contains(ent.text).sum():
symbol = stocks_df[stocks_df['Name'].str.contains(
ent.text)]['Symbol'].values[0]
org_name = stocks_df[stocks_df['Name'].str.contains(
ent.text)]['Name'].values[0]
# Recieve info from yfinance
print(symbol)
stock_info = yf.Ticker(symbol).info
stock_info['Org'] = org_name
stock_info['Symbol'] = symbol
stock_info_list.append(stock_info)
else:
# If name can't be found pass.
pass
except:
# Don't raise an error.
pass
output_df = pd.DataFrame(stock_info_list)
return output_df
# Add input field input field
user_input = st.text_input(
"Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
output_df = stock_info(fin_headings)
output_df = output_df[['Org','Symbol','currentPrice','dayHigh','dayLow','forwardPE','dividendYield']]
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
for heading in fin_headings:
heading = heading.text
if type(heading) == str:
st.markdown("* " + heading)
else:
pass
For issue #2 the patch code that you posted has a small mistake. Rather than checking if heading == str, which does something completely different than you intended and will always be False, you want to check if isinstance(heading, str). That way you get True if heading is a string and False if not. However, even then, it should not be a solution as heading is not a string. Instead you want to call get_text on heading to get the actual text part of the parsed object.
heading.get_text()
More information would be needed to solve issue #1. What does stock_dict look like before you create the Dataframe out of it? Specifically, what values are in stock_dict['dividendYield']? Can you print it and add it to your question?
Also, about the refactoring part. An
else:
pass
block does completely nothing and should be deleted. (When the if condition is false nothing happens anyways)
Related
The following is a python code which prints live data from an API of a data feed vendor. I want the data in the panda's data frame but it prints only the following result
"Empty DataFrame
Columns: []
Index: []"
from truedata_ws.websocket.TD import TD
import time
import pandas as pd
username = ''
password = ''
realtime_port = 8084
url = 'push.truedata.in'
symbols = []
td_obj = TD(username, password, live_port=realtime_port, url=url, log_level=logging.DEBUG, log_format="%(message)s")
print('\nStarting Real Time Feed.... ')
req_ids = td_obj.start_live_data(symbols)
live_data_objs = {}
time.sleep(1)
for req_id in req_ids:
print(f'touchlinedata -> {td_obj.touchline_data[req_id]}')
df=pd.DataFrame(live_data_objs)
print(df)
#td_obj.trade_callback
def strategy_callback(symbol_id, tick_data):
print(f'Trade update > {tick_data}')
while True:
time.sleep(120)
In your code, you pass an empty dictionary as an argument for creating a Data-frame, the Data-Frame you will get back for passing an empty dictionary will be Empty
I am extracting Reddit data via the Pushshift API. More precisely, I am interested in comments and posts (submissions) in subreddit X with search word Y, made from now until datetime Z (e.g. all comments mentioning "GME" in subreddit /rwallstreetbets). All these parameters can be specified. So far, I got it working with the following code:
import pandas as pd
import requests
from datetime import datetime
import traceback
import time
import json
import sys
import numpy as np
username = "" # put the username you want to download in the quotes
subreddit = "gme" # put the subreddit you want to download in the quotes
search_query = "gamestop" # put the word you want to search for (present in comment or post) in the quotes
# leave either one blank to download an entire user's, subreddit's, or search word's history
# or fill in all to download a specific users history from a specific subreddit mentioning a specific word
filter_string = None
if username == "" and subreddit == "" and search_query == "":
print("Fill in either username or subreddit")
sys.exit(0)
elif username == "" and subreddit != "" and search_query == "":
filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "" and search_query == "":
filter_string = f"author={username}"
elif username == "" and subreddit != "" and search_query != "":
filter_string = f"subreddit={subreddit}&q={search_query}"
elif username == "" and subreddit == "" and search_query != "":
filter_string = f"q={search_query}"
else:
filter_string = f"author={username}&subreddit={subreddit}&q={search_query}"
url = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before="
start_time = datetime.utcnow()
def redditAPI(object_type):
global df_comments
df_comments = pd.DataFrame(columns=["date", "comment", "score", "id"])
global df_posts
df_posts = pd.DataFrame(columns=["date", "post", "score", "id"])
print(f"\nLooping through {object_type}s and append to dataframe...")
count = 0
previous_epoch = int(start_time.timestamp())
while True:
# Ensures that loop breaks at March 16 2021 for testing purposes
if previous_epoch <= 1615849200:
break
new_url = url.format(object_type, filter_string)+str(previous_epoch)
json_text = requests.get(new_url)
time.sleep(1) # pushshift has a rate limit, if we send requests too fast it will start returning error messages
try:
json_data = json.loads(json_text.text)
except json.decoder.JSONDecodeError:
time.sleep(1)
continue
if 'data' not in json_data:
break
objects = json_data['data']
if len(objects) == 0:
break
df2 = pd.DataFrame.from_dict(objects)
for object in objects:
previous_epoch = object['created_utc'] - 1
count += 1
if object_type == "comment":
df2.rename(columns={'created_utc': 'date', 'body': 'comment'}, inplace=True)
df_comments = df_comments.append(df2[['date', 'comment', 'score']])
elif object_type == "submission":
df2.rename(columns={'created_utc': 'date', 'selftext': 'post'}, inplace=True)
df_posts = df_posts.append(df2[['date', 'post', 'score']])
# Convert UNIX to datetime
df_comments["date"] = pd.to_datetime(df_comments["date"],unit='s')
df_posts["date"] = pd.to_datetime(df_posts["date"],unit='s')
# Drop blank rows (the case when posts only consists of an image)
df_posts['post'].replace('', np.nan, inplace=True)
df_posts.dropna(subset=['post'], inplace=True)
# Drop duplicates (see last comment on https://www.reddit.com/r/pushshift/comments/b7onr6/max_number_of_results_returned_per_query/)
df_comments = df_comments.drop_duplicates()
df_posts = df_posts.drop_duplicates()
print("\nDone. Saved to dataframe.")
Unfortunately, I do have some performance issues. Due to the fact that I paginate based on created_utc - 1 (and since I do not want to miss any comments/posts), the initial dataframe will contain duplicates (since there won't be 100 (=API limit) new comments/posts every new second). If I run the code for a long time frame (e.g. current time - 1 March 2021), this will result in a huge dataframe which takes considerably long to process.
As the code is right now, the duplicates are added to the dataframe and only after the loop, they are removed. Is there a way to make this more efficient? E.g. to check within the for loop whether the object already exists in the dataframe? Would this make a difference, performance wise? Any input would be very much appreciated.
It is possible to query the data so that there are no duplicates in the first place.
You are using the before parameter of the API, allowing to get only records strictly before the timestamp. This means we can send as before on each iteration the timestamp of the earliest record that we already have. In this case in response we are only gonna have records that we haven't seen yet, so no duplicates.
In code that would look something like this:
import pandas as pd
import requests
import urllib
import time
import json
def get_data(object_type, username='', subreddit='', search_query='', max_time=None, min_time=1615849200):
# start from current time if not specified
if max_time is None:
max_time = int(time.time())
# generate filter string
filter_string = urllib.parse.urlencode(
{k: v for k, v in zip(
['author', 'subreddit', 'q'],
[username, subreddit, search_query]) if v != ""})
url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"
before = max_time
df = pd.DataFrame()
while before > min_time:
url = url_format.format(object_type, filter_string, before)
resp = requests.get(url)
# convert records to dataframe
dfi = pd.json_normalize(json.loads(resp.text)['data'])
if object_type == 'comment':
dfi = dfi.rename(columns={'created_utc': 'date', 'body': 'comment'})
df = pd.concat([df, dfi[['id', 'date', 'comment', 'score']]])
elif object_type == 'submission':
dfi = dfi.rename(columns={'created_utc': 'date', 'selftext': 'post'})
dfi = dfi[dfi['post'].ne('')]
df = pd.concat([df, dfi[['id', 'date', 'post', 'score']]])
# set `before` to the earliest comment/post in the results
# next time we call requests.get(...) we will only get comments/posts before
# the earliest that we already have, thus not fetching any duplicates
before = dfi['date'].min()
# if needed
# time.sleep(1)
return df
Testing by getting the comments and checking for duplicate values (by id):
username = ""
subreddit = "gme"
search_query = "gamestop"
df_comments = get_data(
object_type='comment',
username=username,
subreddit=subreddit,
search_query=search_query)
df_comments['id'].duplicated().any() # False
df_comments['id'].nunique() # 2200
I would suggest a bloom filter to check if values have already been passed through.
There is a package on PyPi, which implements this very easily. To use the bloom filter you just have to add a "key" to the filter, this can be a combination of the username and comment. This way you can check if you have already added comments to your data frame. I suggest that you use the bloom filter as early as possible in your method, i.e. after you get a response from the API.
I suspect this isn't very complicated, but I can't see to figure it out. I'm using Selenium and Beautiful Soup to parse Petango.com. Data will be used to help a local shelter understand how they compare in different metrics to other area shelters. so next will be taking these data frames and doing some analysis.
I grab detail urls from a different module and import the list here.
My issue is, my lists are only showing the value from the HTML from the first dog. I was stepping through and noticed my len are different for the soup iterations, so I realize my error is after that somewhere but I can't figure out how to fix.
Here is my code so far (running the whole process vs using a cached page)
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from Petango import pet_links
headings = []
values = []
ShelterInfo = []
ShelterInfoWebsite = []
ShelterInfoEmail = []
ShelterInfoPhone = []
ShelterInfoAddress = []
Breed = []
Age = []
Color = []
SpayedNeutered = []
Size = []
Declawed = []
AdoptionDate = []
# to access sites, change url list to pet_links (break out as needed) and change if false to true. false looks to the html file
url_list = (pet_links[4], pet_links[6], pet_links[8])
#url_list = ("Petango.html", "Petango.html", "Petango.html")
for link in url_list:
page_source = None
if True:
#pet page = link should populate links from above, hard code link was for 1 detail page, =to hemtl was for cached site
PetPage = link
#PetPage = 'https://www.petango.com/Adopt/Dog-Terrier-American-Pit-Bull-45569732'
#PetPage = Petango.html
PetDriver = webdriver.Chrome(executable_path='/Users/paulcarson/Downloads/chromedriver')
PetDriver.implicitly_wait(30)
PetDriver.get(link)
page_source = PetDriver.page_source
PetDriver.close()
else:
with open("Petango.html",'r') as f:
page_source = f.read()
PetSoup = BeautifulSoup(page_source, 'html.parser')
print(len(PetSoup.text))
#get the details about the shelter and add to lists
ShelterInfo.append(PetSoup.find('div', class_ = "DNNModuleContent ModPethealthPetangoDnnModulesShelterShortInfoC").find('h4').text)
ShelterInfoParagraphs = PetSoup.find('div', class_ = "DNNModuleContent ModPethealthPetangoDnnModulesShelterShortInfoC").find_all('p')
First_Paragraph = ShelterInfoParagraphs[0]
if "Website" not in First_Paragraph.text:
raise AssertionError("first paragraph is not about site")
ShelterInfoWebsite.append(First_Paragraph.find('a').text)
Second_Paragraph = ShelterInfoParagraphs[1]
ShelterInfoEmail.append(Second_Paragraph.find('a')['href'])
Third_Paragraph = ShelterInfoParagraphs[2]
ShelterInfoPhone.append(Third_Paragraph.find('span').text)
Fourth_Paragraph = ShelterInfoParagraphs[3]
ShelterInfoAddress.append(Fourth_Paragraph.find('span').text)
#get the details about the pet
ul = PetSoup.find('div', class_='group details-list').ul # Gets the ul tag
li_items = ul.find_all('li') # Finds all the li tags within the ul tag
for li in li_items:
heading = li.strong.text
headings.append(heading)
value = li.span.text
if value:
values.append(value)
else:
values.append(None)
Breed.append(values[0])
Age.append(values[1])
print(Age)
Color.append(values[2])
SpayedNeutered.append(values[3])
Size.append(values[4])
Declawed.append(values[5])
AdoptionDate.append(values[6])
ShelterDF = pd.DataFrame(
{
'Shelter': ShelterInfo,
'Shelter Website': ShelterInfoWebsite,
'Shelter Email': ShelterInfoEmail,
'Shelter Phone Number': ShelterInfoPhone,
'Shelter Address': ShelterInfoAddress
})
PetDF = pd.DataFrame(
{'Breed': Breed,
'Age': Age,
'Color': Color,
'Spayed/Neutered': SpayedNeutered,
'Size': Size,
'Declawed': Declawed,
'Adoption Date': AdoptionDate
})
print(PetDF)
print(ShelterDF)
output from print len and print the age value as the loop progresses
12783
['6y 7m']
10687
['6y 7m', '6y 7m']
10705
['6y 7m', '6y 7m', '6y 7m']
Could someone please point me in the right direction?
Thank you for your help!
Paul
You need to change the find method into find_all() in BeautifulSoup so it locates all the elements.
Values is global and you only ever append the first value in this list to Age
Age.append(values[1])
Same problem for your other global lists (static index whether 1 or 2 etc...).
You need a way to track the appropriate index to use perhaps through a counter, or determine other logic to ensure current value is added e.g. with current Age, is it is the second li in the loop? Or just append PetSoup.select_one("[data-bind='text: age']").text
It looks like each item of interest e.g. colour, spayed contains the data-bind attribute so you can use those with the appropriate attribute value to select each value and avoid a loop over li elements.
e.g. current_colour = PetSoup.select_one("[data-bind='text: color']").text
Best to set in a variable and test is not None before accessing with .text
I am working on a project, and I need to remove the left and right most character of a data result. The data forms a scrape of craigslist, and the neighborhood results return as '(####)', but what I need it to be is ####. I am using pandas, and trying to use lstrip & rstrip. When I attempt it inside the python shell, it works, but when I use it on my data it does not work.
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
For some reason, the rstrip, does work and removes the ')' but the lstrip does not.
The full code is:
from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv
print('hello world')
#get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span',class_='totalcount').text)
pages = np.arange(0,total+1,120)
neighborhood = []
bedroom_count =[]
sqft = []
price = []
link = []
for page in pages:
#print(page)
response = get('https://washingtondc.craigslist.org/search/hhh?s='+str(page)+'query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
posts = html_result.find_all('li', class_='result-row')
for post in posts:
if post.find('span',class_='result-hood') is not None:
post_url = post.find('a',class_='result-title hdrlnk')
post_link = post_url['href']
link.append(post_link)
post_neighborhood = post.find('span',class_='result-hood').text
post_price = int(post.find('span',class_='result-price').text.strip().replace('$',''))
neighborhood.append(post_neighborhood)
price.append(post_price)
if post.find('span',class_='housing') is not None:
if 'ft2' in post.find('span',class_='housing').text.split()[0]:
post_bedroom = np.nan
post_footage = post.find('span',class_='housing').text.split()[0][:-3]
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
elif len(post.find('span',class_='housing').text.split())>2:
post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
post_footage = post.find('span',class_='housing').text.split()[2][:-3]
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
elif len(post.find('span',class_='housing').text.split())==2:
post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
post_footage = np.nan
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
else:
post_bedroom = np.nan
post_footage = np.nan
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
#create results data frame
post_results = pd.DataFrame({'neighborhood':neighborhood,'footage':sqft,'bedroom':bedroom_count,'price':price,'link':link})
#clean up results
post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0,np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0,np.nan)
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
post_results = post_results.dropna(subset=['footage','bedroom'],how='all')
post_results.to_csv("rent_clean.csv",index=False)
print(len(post_results.index))
This problem will happened when you have whitespace in the front
For example :
s=pd.Series([' (xxxx)','(yyyy) '])
s.str.strip('(|)')
0 (xxxx
1 yyyy)
dtype: object
What we can do is strip twice
s.str.strip().str.strip('(|)')
0 xxxx
1 yyyy
dtype: object
From my understanding of your question, you are removing characters from a string. You don't need pandas for this. Strings have a length and you can remove the first and last character like this;
new_word = old_word[1:-1]
This should work for you. Good luck.
I am trying to obtain a stock's current price, and then put it into a variable to run if / else statements on. I have used the Google API to retrieve current stock prices, but I am unable to figure out how to put it into a variable. Thanks!
import json
import sys
try:
from urllib.request import Request, urlopen
except ImportError: #python 2
from urllib2 import Request, urlopen
googleFinanceKeyToFullName = {
u'id' : u'ID',
u't' : u'StockSymbol',
u'e' : u'Index',
u'l' : u'LastTradePrice',
u'l_cur' : u'LastTradeWithCurrency',
u'ltt' : u'LastTradeTime',
u'lt_dts' : u'LastTradeDateTime',
u'lt' : u'LastTradeDateTimeLong',
u'div' : u'Dividend',
u'yld' : u'Yield'
}
def buildUrl(symbols):
symbol_list = ','.join([symbol for symbol in symbols])
#a deprecated but still active & correct api
return 'http://finance.google.com/finance/info?client=ig&q=' \
+ symbol_list
def request(symbols):
url = buildUrl(symbols)
req = Request(url)
resp = urlopen(req)
#remove special symbols such as the pound symbol
content = resp.read().decode('ascii', 'ignore').strip()
content = content[3:]
return content
def replaceKeys(quotes):
global googleFinanceKeyToFullName
quotesWithReadableKey = []
for q in quotes:
qReadableKey = {}
for k in googleFinanceKeyToFullName:
if k in q:
qReadableKey[googleFinanceKeyToFullName[k]] = q[k]
quotesWithReadableKey.append(qReadableKey)
return quotesWithReadableKey
def getQuotes(symbols):
if type(symbols) == type('str'):
symbols = [symbols]
content = json.loads(request(symbols))
return replaceKeys(content);
if __name__ == '__main__':
try:
symbols = sys.argv[1]
except:
symbols = "GOOG,AAPL,MSFT,AMZN,SBUX"
symbols = symbols.split(',')
try:
print(json.dumps(getQuotes(symbols), indent=2))
except:
print("Fail")
You can get the last current stock price from the dictionary and put it into a variable, say price,
by changing the last part of the code to
try:
quotes = getQuotes(symbols)
price = quotes[-1]['LastTradePrice'] # -1 means last in a list
print(price)
except Exception as e:
print(e)
but it is very unreliable because if the order of prices is changed, you will get a price for a different stock.
What you should do is to learn how to define a data structure that's suitable ro solve your problem.