Run only if geo-info is available in Python

Here is a snippet of the code:
import flickrapi

api_key = "xxxxxxxxxxxxxxxxxxxxx"
secret_api_key = "xxxxxxxxxx"

flickr = flickrapi.FlickrAPI(api_key, secret_api_key)

def obtainImages3():
    group_list = flickr.groups.search(api_key=api_key, text='Paris', per_page=10)
    for group in group_list[0]:
        group_images = flickr.groups.pools.getPhotos(api_key=api_key, group_id=group.attrib['nsid'], extras='geo, tags, url_s')
        for image in group_images[0]:
            url = image.attrib['url_s']
            tags = image.attrib['tags']
            if image.attrib['geo'] != 'null':
                photo_location = flickr.photos_geo_getLocation(photo_id=image.attrib['id'])
                lat = float(photo_location[0][0].attrib['latitude'])
                lon = float(photo_location[0][0].attrib['longitude'])
I want to get information about an image if and only if it has a geo-tag attached to it. I tried to do this with the line if image.attrib['geo'] != 'null', but I don't think that works. Can anyone suggest a way I might be able to do it? Thanks in advance!

Replace your if image.attrib['geo'] != 'null' condition with a try/except block as below.
Since the photo element's attributes (image.attrib) behave like a dictionary, you can check whether the geo key is present by catching the KeyError:
try:
    image.attrib['geo']
    photo_location = flickr.photos_geo_getLocation(photo_id=image.attrib['id'])
    lat = float(photo_location[0][0].attrib['latitude'])
    lon = float(photo_location[0][0].attrib['longitude'])
except KeyError:
    pass
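An alternative sketch, based only on an assumption about how the extras parameter behaves: when 'geo' is included in extras, each returned photo element may carry latitude and longitude attributes directly, with "0" for photos that are not geotagged, so you could filter on those instead of calling photos_geo_getLocation for every image:

for image in group_images[0]:
    # Attribute names are an assumption about the 'geo' extra;
    # verify them against an actual API response before relying on this.
    lat_attr = image.attrib.get('latitude')
    lon_attr = image.attrib.get('longitude')
    if lat_attr and lon_attr and lat_attr != '0':
        lat = float(lat_attr)
        lon = float(lon_attr)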

Related

Python, Streamlit, and yfinance issues

I'll list the two bugs I know of so far; if you have any recommendations for refactoring my code, please let me know as well.
1. yfinance is not appending the dividendYield to my dict, even though I made sure that there is an actual dividend yield for those symbols.
2. TypeError: can only concatenate str (not "Tag") to str, which I assume has something to do with how the XML is parsed: it ran into a Tag, so I am not able to create the expander. I thought I could solve it with the if statement below, but instead I just don't get any expander at all.
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
Full code for main.py:
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")

def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings

def stock_info(headings):
    # Get the entities from each heading, link them with nasdaq data if possible, and extract market data with yfinance.
    stock_dict = {
        'Org': [],
        'Symbol': [],
        'currentPrice': [],
        'dayHigh': [],
        'dayLow': [],
        'forwardPE': [],
        'dividendYield': []
    }
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    stock_info = yf.Ticker(symbol).info
                    print(symbol)
                    stock_dict['Org'].append(org_name)
                    stock_dict['Symbol'].append(symbol)
                    stock_dict['currentPrice'].append(
                        stock_info['currentPrice'])
                    stock_dict['dayHigh'].append(stock_info['dayHigh'])
                    stock_dict['dayLow'].append(stock_info['dayLow'])
                    stock_dict['forwardPE'].append(stock_info['forwardPE'])
                    stock_dict['dividendYield'].append(
                        stock_info['dividendYield'])
                else:
                    # If name can't be found, pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame.from_dict(stock_dict, orient='index')
    output_df = output_df.transpose()
    return output_df

# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")

# Get financial headlines
fin_headings = extract_rss(user_input)
print(fin_headings)

# Output financial info
output_df = stock_info(fin_headings)
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
There is an issue in the logic of your stock_info function: because of it, the same symbol can end up with different values, and when you clean the duplicates, drop_duplicates keeps only the row with the first occurrence of each symbol.
The code below should solve both of your issues.
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")

def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings

def stock_info(headings):
    stock_info_list = []
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    print(symbol)
                    stock_info = yf.Ticker(symbol).info
                    stock_info['Org'] = org_name
                    stock_info['Symbol'] = symbol
                    stock_info_list.append(stock_info)
                else:
                    # If name can't be found, pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame(stock_info_list)
    return output_df

# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")

# Get financial headlines
fin_headings = extract_rss(user_input)

output_df = stock_info(fin_headings)
output_df = output_df[['Org', 'Symbol', 'currentPrice', 'dayHigh', 'dayLow', 'forwardPE', 'dividendYield']]
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        heading = heading.text
        if type(heading) == str:
            st.markdown("* " + heading)
        else:
            pass
For issue #2, the patch code that you posted has a small mistake. Rather than checking if heading == str, which compares heading to the str type itself and is therefore always False, you want to check isinstance(heading, str). That way you get True if heading is a string and False if not. However, even that is not the fix here, because heading is not a string. Instead you want to call get_text on heading to get the actual text part of the parsed object:
heading.get_text()
More information would be needed to solve issue #1. What does stock_dict look like before you create the DataFrame from it? Specifically, what values are in stock_dict['dividendYield']? Can you print it and add it to your question?
Also, about the refactoring part: an

else:
    pass

block does nothing at all and should be deleted. (When the if condition is false, nothing happens anyway.)
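Putting both points together, the expander loop could look something like this (a sketch based on the code in the question, assuming fin_headings holds the BeautifulSoup title tags returned by extract_rss):

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        # Extract the plain text from the bs4 Tag before concatenating.
        text = heading.get_text()
        if isinstance(text, str) and text:
            st.markdown("* " + text)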

Code efficiency/performance improvement in Pushshift Reddit web scraping loop

I am extracting Reddit data via the Pushshift API. More precisely, I am interested in comments and posts (submissions) in subreddit X with search word Y, made from now until datetime Z (e.g. all comments mentioning "GME" in the subreddit r/wallstreetbets). All these parameters can be specified. So far, I got it working with the following code:
import pandas as pd
import requests
from datetime import datetime
import traceback
import time
import json
import sys
import numpy as np

username = ""  # put the username you want to download in the quotes
subreddit = "gme"  # put the subreddit you want to download in the quotes
search_query = "gamestop"  # put the word you want to search for (present in comment or post) in the quotes
# leave either one blank to download an entire user's, subreddit's, or search word's history
# or fill in all to download a specific users history from a specific subreddit mentioning a specific word

filter_string = None
if username == "" and subreddit == "" and search_query == "":
    print("Fill in either username or subreddit")
    sys.exit(0)
elif username == "" and subreddit != "" and search_query == "":
    filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "" and search_query == "":
    filter_string = f"author={username}"
elif username == "" and subreddit != "" and search_query != "":
    filter_string = f"subreddit={subreddit}&q={search_query}"
elif username == "" and subreddit == "" and search_query != "":
    filter_string = f"q={search_query}"
else:
    filter_string = f"author={username}&subreddit={subreddit}&q={search_query}"

url = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before="

start_time = datetime.utcnow()

def redditAPI(object_type):
    global df_comments
    df_comments = pd.DataFrame(columns=["date", "comment", "score", "id"])
    global df_posts
    df_posts = pd.DataFrame(columns=["date", "post", "score", "id"])

    print(f"\nLooping through {object_type}s and append to dataframe...")

    count = 0
    previous_epoch = int(start_time.timestamp())
    while True:
        # Ensures that loop breaks at March 16 2021 for testing purposes
        if previous_epoch <= 1615849200:
            break

        new_url = url.format(object_type, filter_string) + str(previous_epoch)
        json_text = requests.get(new_url)
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        try:
            json_data = json.loads(json_text.text)
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break

        df2 = pd.DataFrame.from_dict(objects)

        for object in objects:
            previous_epoch = object['created_utc'] - 1
            count += 1

        if object_type == "comment":
            df2.rename(columns={'created_utc': 'date', 'body': 'comment'}, inplace=True)
            df_comments = df_comments.append(df2[['date', 'comment', 'score']])
        elif object_type == "submission":
            df2.rename(columns={'created_utc': 'date', 'selftext': 'post'}, inplace=True)
            df_posts = df_posts.append(df2[['date', 'post', 'score']])

    # Convert UNIX to datetime
    df_comments["date"] = pd.to_datetime(df_comments["date"], unit='s')
    df_posts["date"] = pd.to_datetime(df_posts["date"], unit='s')

    # Drop blank rows (the case when posts only consists of an image)
    df_posts['post'].replace('', np.nan, inplace=True)
    df_posts.dropna(subset=['post'], inplace=True)

    # Drop duplicates (see last comment on https://www.reddit.com/r/pushshift/comments/b7onr6/max_number_of_results_returned_per_query/)
    df_comments = df_comments.drop_duplicates()
    df_posts = df_posts.drop_duplicates()

    print("\nDone. Saved to dataframe.")
Unfortunately, I do have some performance issues. Due to the fact that I paginate based on created_utc - 1 (and since I do not want to miss any comments/posts), the initial dataframe will contain duplicates (since there won't be 100 (=API limit) new comments/posts every new second). If I run the code for a long time frame (e.g. current time - 1 March 2021), this will result in a huge dataframe which takes considerably long to process.
As the code is right now, the duplicates are added to the dataframe and only after the loop, they are removed. Is there a way to make this more efficient? E.g. to check within the for loop whether the object already exists in the dataframe? Would this make a difference, performance wise? Any input would be very much appreciated.
It is possible to query the data so that there are no duplicates in the first place.
You are using the before parameter of the API, which returns only records strictly before the given timestamp. This means that on each iteration we can pass, as before, the timestamp of the earliest record we already have. The response will then contain only records we haven't seen yet, so there are no duplicates.
In code that would look something like this:
import pandas as pd
import requests
import urllib
import time
import json

def get_data(object_type, username='', subreddit='', search_query='', max_time=None, min_time=1615849200):
    # start from current time if not specified
    if max_time is None:
        max_time = int(time.time())

    # generate filter string
    filter_string = urllib.parse.urlencode(
        {k: v for k, v in zip(
            ['author', 'subreddit', 'q'],
            [username, subreddit, search_query]) if v != ""})

    url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"

    before = max_time
    df = pd.DataFrame()
    while before > min_time:
        url = url_format.format(object_type, filter_string, before)
        resp = requests.get(url)

        # convert records to dataframe
        dfi = pd.json_normalize(json.loads(resp.text)['data'])

        if object_type == 'comment':
            dfi = dfi.rename(columns={'created_utc': 'date', 'body': 'comment'})
            df = pd.concat([df, dfi[['id', 'date', 'comment', 'score']]])
        elif object_type == 'submission':
            dfi = dfi.rename(columns={'created_utc': 'date', 'selftext': 'post'})
            dfi = dfi[dfi['post'].ne('')]
            df = pd.concat([df, dfi[['id', 'date', 'post', 'score']]])

        # set `before` to the earliest comment/post in the results
        # next time we call requests.get(...) we will only get comments/posts before
        # the earliest that we already have, thus not fetching any duplicates
        before = dfi['date'].min()

        # if needed
        # time.sleep(1)

    return df
Testing by getting the comments and checking for duplicate values (by id):
username = ""
subreddit = "gme"
search_query = "gamestop"
df_comments = get_data(
object_type='comment',
username=username,
subreddit=subreddit,
search_query=search_query)
df_comments['id'].duplicated().any() # False
df_comments['id'].nunique() # 2200
I would suggest a bloom filter to check whether values have already been seen.
There is a package on PyPI which implements this very easily. To use the bloom filter you just add a "key" to the filter; this can be a combination of the username and the comment. That way you can check whether you have already added a comment to your data frame. I suggest applying the bloom filter as early as possible in your method, i.e. right after you get a response from the API.
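A minimal sketch of that idea, assuming the pybloom-live package from PyPI (any bloom filter library with an add() method and membership tests would work the same way); the choice of author + created_utc + body as the key is also only an assumption about what identifies a record:

import pandas as pd
from pybloom_live import BloomFilter  # assumed package: pip install pybloom-live

# capacity and error_rate are illustrative values
seen = BloomFilter(capacity=1000000, error_rate=0.001)

rows = []
for obj in objects:  # objects = json_data['data'] from the Pushshift response
    key = f"{obj.get('author')}|{obj.get('created_utc')}|{obj.get('body', '')}"
    if key in seen:
        continue  # probably seen already, skip it
    seen.add(key)
    rows.append(obj)

# build the batch dataframe only from records that passed the filter
df2 = pd.DataFrame.from_dict(rows)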

How to filter JSON data with Python

I am trying to create a function that filters JSON data pulled from the Google Places API. I want it to return the name of a business and its types values if the name contains the string "Body Shop" and the types are ['car_repair', 'point_of_interest', 'establishment']; otherwise I want it to reject the result. Here is my code so far. I have tried and tried and can't seem to figure out a way to store the criteria to make the search easier.
import googlemaps
import pprint
import time
import urllib.request

API_KEY = 'YOUR_API_KEY'
lat, lng = 40.35003, -111.95206

# define our Client
gmaps = googlemaps.Client(key=API_KEY)

# Define our Search
places_result = gmaps.places_nearby(location="40.35003,-111.95206", radius=40000, open_now=False, type=['car_repair', 'point_of_interest', 'establishment'])
# pprint.pprint(places_result['results'])

time.sleep(3)

places_result_2 = gmaps.places_nearby(page_token=places_result['next_page_token'])
pprint.pprint(places_result_2['results'])

places_result_2 = gmaps.places_nearby(page_token=places_result['next_page_token'])

types = place_details['result']['types']
name = place_details['result']['name']

def match(types, name):
    for val in types:
        'car_repair', 'point_of_interest', 'establishment' in val and "Body Shop" in name
    print(name, types)
Try this:
import googlemaps
import pprint
import time
import urllib.request

API_KEY = 'YOUR_API_KEY'
lat, lng = 40.35003, -111.95206

# define our Client
gmaps = googlemaps.Client(key=API_KEY)

# Define our Search
places_result = gmaps.places_nearby(location="40.35003,-111.95206", radius=40000, open_now=False, type=['car_repair', 'point_of_interest', 'establishment'])

# here's how to retrieve the name of the first result
example_of_name = places_result['results'][0]['name']
print(example_of_name)

# gets places name and type for all the results
for place in places_result['results']:
    print("Name of Place:")
    print(place['name'])
    print("Type of the place:")
    print(place['types'], "\n")
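To apply the actual filter described in the question, a sketch along these lines could follow the search; the "Body Shop" string and the list of types come from the question, and the name/types fields are the ones shown in the results above:

WANTED_TYPES = {'car_repair', 'point_of_interest', 'establishment'}

def match(place):
    # keep a place only if its name mentions "Body Shop"
    # and it carries all of the wanted types
    return ("Body Shop" in place.get('name', '')
            and WANTED_TYPES.issubset(place.get('types', [])))

for place in places_result['results']:
    if match(place):
        print(place['name'], place['types'])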

Why won't my program accept a Pandas data series as input?

I'm working on this for a school project. I'm basically scraping IP addresses off of the Wikipedia edit history. I'm then running the IP addresses through the ipstack.com API and getting lat and long. I'm then trying to push the lat and long to the OpenCage API, but here is where I am running into an issue. If I hard-code a lat and long into this, it returns a city.
result = geocoder.opencage([latitude, longitude], key=key, method='reverse')
print(result.city)
But when I try to loop through a lat and long list I get an error
TypeError: cannot convert the series to <class 'float'>
I'm thinking this might have to do with series type but then again I might be going about it completely wrong. Any ideas?
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
import pandas as pd
import re
from opencage.geocoder import OpenCageGeocode
import geocoder

response = requests.get("https://en.wikipedia.org/w/index.php?title=Gun_laws_in_New_Hampshire&action=history")
soup = BeautifulSoup(response.text, "lxml")

bdi_text = []
for bdi_tag in soup.find_all('bdi'):
    bdi_text.append(bdi_tag.text)

ip_addresses = []
for element in bdi_text:
    ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', element)
    if len(ip) > 0:
        ip_addresses.append(ip)

api_key = '?access_key={YOUR_API_ACCESS_KEY}'
resolved_ips = []
for ips in ip_addresses:
    api_call = requests.get('http://api.ipstack.com/' + ips[0] + api_key).json()
    resolved_ips.append(api_call)

ip_df = pd.DataFrame.from_records(resolved_ips)
ip_df = ip_df[['city', 'country_code', 'latitude', 'longitude']]

key = 'my_API_key'
latitude = ip_df['latitude']
longitude = ip_df['longitude']

result = []
print(len(latitude))
for latlong in range(0, len(latitude)):
    result = geocoder.opencage([latitude, longitude], key=key, method='reverse')
    print(result.city)
Your implementation is rough. I would do something like this:
def make_city(row):
    result = geocoder.opencage(float(row['latitude']),   # lat of target
                               float(row['longitude']),  # long of target
                               key=key,                  # API key that I will keep to myself
                               method='reverse')
    print(result.city)

ip_df.apply(make_city, axis=1)
I think it's getting confused by the type you are passing.
I'm not sure of the exact structure of your data, but try this instead:
latitude = ip_df['latitude'].astype(float)
longitude = ip_df['longitude'].astype(float)
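Even with the cast, the loop in the question still passes the entire Series to every call. A sketch of the loop indexing one scalar pair per row, reusing the call signature that the question says works with hard-coded values:

latitude = ip_df['latitude'].astype(float)
longitude = ip_df['longitude'].astype(float)

for i in range(len(latitude)):
    # pass a single lat/long pair per request instead of the whole Series
    result = geocoder.opencage([latitude.iloc[i], longitude.iloc[i]], key=key, method='reverse')
    print(result.city)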

Python: How to retrieve a stock's last trade price from the dictionary and put it into a variable?

I am trying to obtain a stock's current price, and then put it into a variable to run if / else statements on. I have used the Google API to retrieve current stock prices, but I am unable to figure out how to put it into a variable. Thanks!
import json
import sys

try:
    from urllib.request import Request, urlopen
except ImportError:  # python 2
    from urllib2 import Request, urlopen

googleFinanceKeyToFullName = {
    u'id': u'ID',
    u't': u'StockSymbol',
    u'e': u'Index',
    u'l': u'LastTradePrice',
    u'l_cur': u'LastTradeWithCurrency',
    u'ltt': u'LastTradeTime',
    u'lt_dts': u'LastTradeDateTime',
    u'lt': u'LastTradeDateTimeLong',
    u'div': u'Dividend',
    u'yld': u'Yield'
}

def buildUrl(symbols):
    symbol_list = ','.join([symbol for symbol in symbols])
    # a deprecated but still active & correct api
    return 'http://finance.google.com/finance/info?client=ig&q=' + symbol_list

def request(symbols):
    url = buildUrl(symbols)
    req = Request(url)
    resp = urlopen(req)
    # remove special symbols such as the pound symbol
    content = resp.read().decode('ascii', 'ignore').strip()
    content = content[3:]
    return content

def replaceKeys(quotes):
    global googleFinanceKeyToFullName
    quotesWithReadableKey = []
    for q in quotes:
        qReadableKey = {}
        for k in googleFinanceKeyToFullName:
            if k in q:
                qReadableKey[googleFinanceKeyToFullName[k]] = q[k]
        quotesWithReadableKey.append(qReadableKey)
    return quotesWithReadableKey

def getQuotes(symbols):
    if type(symbols) == type('str'):
        symbols = [symbols]
    content = json.loads(request(symbols))
    return replaceKeys(content)

if __name__ == '__main__':
    try:
        symbols = sys.argv[1]
    except:
        symbols = "GOOG,AAPL,MSFT,AMZN,SBUX"
    symbols = symbols.split(',')
    try:
        print(json.dumps(getQuotes(symbols), indent=2))
    except:
        print("Fail")
You can get the last stock price from the dictionary and put it into a variable, say price, by changing the last part of the code to:
try:
    quotes = getQuotes(symbols)
    price = quotes[-1]['LastTradePrice']  # -1 means last in a list
    print(price)
except Exception as e:
    print(e)
but this is very unreliable, because if the order of the quotes changes you will get the price of a different stock.
What you should do is learn how to define a data structure that is suited to solving your problem.
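For example, one way to make the lookup independent of ordering is to key the quotes by their symbol; this sketch reuses getQuotes and the field names defined in googleFinanceKeyToFullName above:

try:
    quotes = getQuotes(symbols)
    # map each symbol to its last trade price so lookups are explicit
    prices_by_symbol = {q['StockSymbol']: q['LastTradePrice'] for q in quotes}
    price = prices_by_symbol['AAPL']  # e.g. one of the default symbols
    print(price)
except Exception as e:
    print(e)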
