In the code block below, I have a dataframe, geo, which I want to iterate over to get the easting, northing, longitude and latitude for each UK postcode in geo. I've written one function to call the API and another to return the four variables.
I've tested the get_data call with a postcode to prove it works (this is a public API anyone can use):
import requests
import pandas as pd

geo = spark.table('property_address').toPandas()

def call_api(url: str) -> dict:
    postcode_response = requests.get(url)
    return postcode_response.json()

def get_data(postcode):
    url = f"http://api.getthedata.com/postcode/{postcode}"
    req = requests.get(url)
    results = req.json()['data']
    easting = results['easting']
    northing = results['northing']
    latitude = results['latitude']
    longitude = results['longitude']
    return easting, northing, latitude, longitude
get_data('SW1A 1AA')
which returns:
Out[108]: (529090, 179645, '51.501009', '-0.141588')
What I want to do is run that for each row in geo and return the results as a dataset. My research has led me to apply, and I've based my attempt on this guide.
I'm trying to pass a column called property_postcode in geo and iterate each row to return the values, here's my attempt:
def get_columns(row):
    column_name = 'property_postcode'
    api_param = row[column_name]
    easting, northing, latitude, longitude = get_data(api_param)
    row['east'] = easting
    row['north'] = northing
    row['lat'] = latitude
    row['long'] = longitude
    return row

geo = geo.apply(get_columns, axis=1)
display(geo)
The error I get is
`JSONDecodeError: Expecting value: line 1 column 1 (char 0)`
That doesn't tell me a huge amount. I'm looking for assistance/pointers.
Instead of trying to set the values for the east, north, lat and long columns in the function, return them from the function.
import requests
import pandas as pd

# geo = spark.table('property_address').toPandas()

def call_api(url: str) -> dict:
    postcode_response = requests.get(url)
    return postcode_response.json()

def get_data(postcode):
    url = f"http://api.getthedata.com/postcode/{postcode}"
    req = requests.get(url)
    if req.json()["status"] == "match":
        results = req.json()["data"]
        easting = results.get("easting")
        northing = results.get("northing")
        latitude = results.get("latitude")
        longitude = results.get("longitude")
    else:
        easting = None
        northing = None
        latitude = None
        longitude = None
    return easting, northing, latitude, longitude

def get_columns(code):
    api_param = code
    return get_data(api_param)

df = pd.DataFrame(
    {
        "property_postcode": [
            "BE21 6NZ",
            "SW1A 1AA",
            "W1A 1AA",
            "DE21",
            "B31",
            "ST16 2NY",
            "S65 1EN",
        ]
    }
)

df[["east", "north", "lat", "long"]] = df.apply(
    lambda row: get_columns(row["property_postcode"]), axis=1, result_type="expand"
)
print(df)
  property_postcode    east   north        lat       long
0          BE21 6NZ     NaN     NaN       None       None
1          SW1A 1AA  529090  179645  51.501009  -0.141588
2           W1A 1AA  528887  181593  51.518561  -0.143799
3              DE21     NaN     NaN       None       None
4               B31     NaN     NaN       None       None
5          ST16 2NY  391913  323540  52.809346  -2.121413
6           S65 1EN  444830  394082   53.44163  -1.326573
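As for the original JSONDecodeError: it usually means the response body was not JSON at all, for example an empty body or an HTML error page for a postcode the API cannot match. A minimal sketch for inspecting the raw response before decoding it might look like this (get_data_debug is a hypothetical helper, and the status-code check is an assumption about how the API signals failure, not something confirmed by its documentation):

def get_data_debug(postcode):
    url = f"http://api.getthedata.com/postcode/{postcode}"
    req = requests.get(url)
    # Look at the raw response before trying to decode it as JSON.
    if req.status_code != 200 or not req.text.strip():
        print(f"{postcode!r} -> HTTP {req.status_code}: {req.text[:100]!r}")
        return None, None, None, None
    payload = req.json()
    if payload.get("status") != "match":
        return None, None, None, None
    results = payload["data"]
    return (results.get("easting"), results.get("northing"),
            results.get("latitude"), results.get("longitude"))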
I have the following function:
def my_funct(Keyword, Dates, Country, Col_name):
    KEYWORDS = [Keyword]
    KEYWORDS_CODES = [pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
    df_CODES = pd.DataFrame(KEYWORDS_CODES)
    EXACT_KEYWORDS = df_CODES['mid'].to_list()
    DATE_INTERVAL = Dates
    COUNTRY = [Country]  # Use this link for iso country code
    CATEGORY = 0  # Use this link to select categories
    SEARCH_TYPE = ''  # default is 'web searches'; others include 'images', 'news', 'youtube', 'froogle' (google shopping)
    Individual_EXACT_KEYWORD = list(zip(*[iter(EXACT_KEYWORDS)]*1))
    Individual_EXACT_KEYWORD = [list(x) for x in Individual_EXACT_KEYWORD]
    dicti = {}
    i = 1
    for Country in COUNTRY:
        for keyword in Individual_EXACT_KEYWORD:
            try:
                pytrend.build_payload(kw_list=keyword,
                                      timeframe=DATE_INTERVAL,
                                      geo=Country,
                                      cat=CATEGORY,
                                      gprop=SEARCH_TYPE)
                dicti[i] = pytrend.interest_over_time()
                i += 1
                time.sleep(6)
            except requests.exceptions.Timeout:
                print("Timeout occured")
    df_trends = pd.concat(dicti, axis=1)
    df_trends.columns = df_trends.columns.droplevel(0)  # drop outside header
    df_trends = df_trends.drop('isPartial', axis=1)  # drop "isPartial"
    df_trends.reset_index(level=0, inplace=True)  # reset index
    df_trends.columns = ['date', Col_name]  # change column names
    return df_trends
Then I call the function using:
x1 = my_funct('Unemployment', '2004-01-04 2009-01-04', 'DK', 'Unemployment (Denmark)')
Then I put that into a df:
df1 = pd.DataFrame(x1)
Once I convert that df to Excel, how do I ensure that the date is in YYYY-MM-DD format without the dangling 00:00:00? Every time I convert, it comes out with hours and seconds.
I tried df1 = pd.DataFrame(x1).dt.strftime('%Y-%m-%d'), but it says that this cannot be used.
Please help
Thanks
You are trying to call dt.strftime on the entire dataframe, but you need to call it on the date column:
df1['date'] = df1['date'].dt.strftime('%Y-%m-%d')
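A minimal end-to-end sketch of that flow (the output file name and the pd.to_datetime guard are assumptions, not part of the original question):

import pandas as pd

df1 = pd.DataFrame(x1)
# Ensure the column is datetime-typed before using the .dt accessor
# (a no-op if my_funct already returned datetimes).
df1['date'] = pd.to_datetime(df1['date'])
df1['date'] = df1['date'].dt.strftime('%Y-%m-%d')  # plain strings, no 00:00:00
df1.to_excel('trends.xlsx', index=False)  # hypothetical file name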
Given a small dataset df as follows:
id name address
0 1 ABC tower 北京市朝阳区
1 2 AC park 北京市海淀区
2 3 ZR hospital 上海市黄浦区
3 4 Fengtai library NaN
4 5 Square Point 上海市虹口区
I would like to obtain longitude and latitude for the address column and append them to the original dataframe. Please note there are NaNs in the address column.
The code below gives me a table with addresses, longitude and latitude, but it ignores the NaN address rows; the code could also stand to be improved:
import pandas as pd
import requests
import json

df = df[df['address'].notna()]
res = []
for addre in df['address']:
    url = "http://restapi.amap.com/v3/geocode/geo?key=f057101329c0200f170be166d9b023a1&address=" + addre
    dat = {
        'count': "1",
    }
    r = requests.post(url, data=json.dumps(dat))
    s = r.json()
    infos = s['geocodes']
    for j in range(0, 10000):
        # print(j)
        try:
            more_infos = infos[j]
            # print(more_infos)
        except:
            continue
        try:
            data = more_infos['location']
            # print(data)
        except:
            continue
        try:
            lon_lat = data.split(',')
            lon = float(lon_lat[0])
            lat = float(lon_lat[1])
        except:
            continue
        res.append([addre, lon, lat])

result = pd.DataFrame(res)
result.columns = ['address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index=False)
Out:
address longitude latitude
0 北京市朝阳区 116.601144 39.948574
1 北京市海淀区 116.329519 39.972134
2 上海市黄浦区 121.469240 31.229860
3 上海市虹口区 121.505133 31.264600
But how could I get the final result as follows? Thanks for your kind help in advance.
id name address longitude latitude
0 1 ABC tower 北京市朝阳区 116.601144 39.948574
1 2 AC park 北京市海淀区 116.329519 39.972134
2 3 ZR hospital 上海市黄浦区 121.469240 31.229860
3 4 Fengtai library NaN NaN NaN
4 5 Square Point 上海市虹口区 121.505133 31.264600
Use pd.merge, since result is the longitude & latitude dataframe:
dfn = pd.merge(df, result, on='address', how='left')
or

for _, row in df.iterrows():
    _id = row['id']
    name = row['name']
    addre = row['address']
    if pd.isna(row['address']):
        res.append([_id, name, addre, None, None])
        continue
    ###### same code ######
    url = '...'
    # ...
    ###### same code ######
    res.append([_id, name, addre, lon, lat])

result = pd.DataFrame(res)
result.columns = ['id', 'name', 'address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index=False)
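If you prefer to stay column-oriented, a hedged sketch using apply is below; geocode_one is a hypothetical helper standing in for the amap request/parse logic above:

def geocode_one(addre):
    # Hypothetical helper wrapping the amap request above;
    # returns (lon, lat), or (None, None) for missing/unmatched addresses.
    if pd.isna(addre):
        return None, None
    lon, lat = None, None
    # ... same request/parse code as above, filling lon and lat ...
    return lon, lat

df[['longitude', 'latitude']] = df['address'].apply(
    lambda a: pd.Series(geocode_one(a))
)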
I've been trying to structure an API response from XML into a readable pandas dataframe. I found a lot of inspiration in earlier threads on this topic, but the values in my dataframe still display as "None".
The XML response:
<VehiclePositionResponse xmlns="http://fms-standard.com/rfms/v1.0.0/xsd/position" xmlns:ns2="http://fms-standard.com/rfms/v1.0.0/xsd/common/position">
    <VehiclePosition>
        <VIN>YS2R8X40005440923</VIN>
        <TriggerType>OTHER</TriggerType>
        <CreatedDateTime>2019-07-31T16:50:28</CreatedDateTime>
        <ReceivedDateTime>2019-07-31T16:50:29</ReceivedDateTime>
        <GNSSPosition>
            <ns2:Latitude>62.098339</ns2:Latitude>
            <ns2:Longitude>10.542222</ns2:Longitude>
            <ns2:Heading>291</ns2:Heading>
            <ns2:Altitude>655</ns2:Altitude>
            <ns2:Speed>0</ns2:Speed>
            <ns2:PositionDateTime>2019-07-31T16:50:28</ns2:PositionDateTime>
        </GNSSPosition>
        <WheelBasedSpeed></WheelBasedSpeed>
    </VehiclePosition>
    <VehiclePosition>
        <VIN>YS2R8X40005441367</VIN>
        <TriggerType>OTHER</TriggerType>
        <CreatedDateTime>2019-07-31T18:13:24</CreatedDateTime>
        <ReceivedDateTime>2019-07-31T18:13:25</ReceivedDateTime>
        <GNSSPosition>
            <ns2:Latitude>62.127206</ns2:Latitude>
            <ns2:Longitude>10.608676</ns2:Longitude>
            <ns2:Heading>3</ns2:Heading>
etc.
Code:
import requests
import pandas as pd
import xml.etree.ElementTree as cET  # ElementTree parser, aliased to match the original code

# Token and url are defined elsewhere
headers = {'Authorization': Token, 'Content-Type': 'application/xml'}
r = requests.get(url, headers=headers)

def getvalueofnode(node):
    return node.text if node is not None else None

def main():
    root = cET.fromstring(r.content)
    dfcols = ['VIN', 'CreatedDateTime', 'ReceivedDateTime', 'Latitude', 'Longitude', 'Altitude']
    df_xml = pd.DataFrame(columns=dfcols)
    for node in root:
        VIN = node.find('VIN')
        CreatedDateTime = node.find('CreatedDateTime')
        ReceivedDateTime = node.find('ReceivedDateTime')
        Latitude = node.find('Latitude')
        Longitude = node.find('Longitude')
        Altitude = node.find('Altitude')
        df_xml = df_xml.append(
            pd.Series([getvalueofnode(VIN), getvalueofnode(CreatedDateTime), getvalueofnode(ReceivedDateTime),
                       getvalueofnode(Latitude), getvalueofnode(Longitude), getvalueofnode(Altitude)],
                      index=dfcols),
            ignore_index=True)
    print(df_xml)

main()
This is what my resulting dataframe looks like: every value in every column comes back as None.
Essentially, you are not accounting for the namespaces defined in the XML's root tag, which is likely the reason for all the None results. Consider parsing with the namespaces defined. Since one is the default namespace, give it any prefix, such as doc, and parse with it:
ns = {"doc":"http://fms-standard.com/rfms/v1.0.0/xsd/position",
"ns2":"http://fms-standard.com/rfms/v1.0.0/xsd/common/position"}
for node in root:
VIN = node.find("doc:VIN", ns)
CreatedDateTime = node.find('doc:CreatedDateTime', ns)
ReceivedDateTime = node.find('doc:ReceivedDateTime', ns)
Latitude = node.find('doc:GNSSPosition/ns2:Latitude', ns)
Longitude = node.find('doc:GNSSPosition/ns2:Longitude', ns)
Altitude = node.find('doc:GNSSPosition/ns2:Altitude', ns)
Additionally, avoid the quadratic copying of calling append in a loop. Instead, build a list of dictionaries and pass it to the DataFrame() constructor.
def main2():
    root = cET.fromstring(r.content)
    ns = {"doc": "http://fms-standard.com/rfms/v1.0.0/xsd/position",
          "ns2": "http://fms-standard.com/rfms/v1.0.0/xsd/common/position"}
    data_list = [{'VIN': getvalueofnode(node.find("doc:VIN", ns)),
                  'CreatedDateTime': getvalueofnode(node.find('doc:CreatedDateTime', ns)),
                  'ReceivedDateTime': getvalueofnode(node.find('doc:ReceivedDateTime', ns)),
                  'Latitude': getvalueofnode(node.find('doc:GNSSPosition/ns2:Latitude', ns)),
                  'Longitude': getvalueofnode(node.find('doc:GNSSPosition/ns2:Longitude', ns)),
                  'Altitude': getvalueofnode(node.find('doc:GNSSPosition/ns2:Altitude', ns))}
                 for node in root]
    df_xml = pd.DataFrame(data_list)
Output
print(df_xml)
# Altitude CreatedDateTime Latitude Longitude ReceivedDateTime VIN
# 0 655 2019-07-31T16:50:28 62.098339 10.542222 2019-07-31T16:50:29 YS2R8X40005440923
# 1 None 2019-07-31T18:13:24 62.127206 10.608676 2019-07-31T18:13:25 YS2R8X40005441367
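As a side note: on newer pandas versions (1.3+), pandas.read_xml can resolve these namespaces directly. A hedged sketch under that assumption (the positional join assumes exactly one GNSSPosition per VehiclePosition, as in the sample XML):

import io
import pandas as pd

ns = {"doc": "http://fms-standard.com/rfms/v1.0.0/xsd/position",
      "ns2": "http://fms-standard.com/rfms/v1.0.0/xsd/common/position"}

# Each GNSSPosition element becomes one row with Latitude, Longitude, Altitude, ...
gps = pd.read_xml(io.BytesIO(r.content), xpath=".//doc:GNSSPosition", namespaces=ns)
# VIN and the timestamps live one level up, so read them separately and join by position.
meta = pd.read_xml(io.BytesIO(r.content), xpath=".//doc:VehiclePosition", namespaces=ns)
df_xml = meta[['VIN', 'CreatedDateTime', 'ReceivedDateTime']].join(gps)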
I retrieve data from quandl and load it to a pandas DF object.
Afterwards I calculate SMA values (SMA21, SMA55) based on "Last Price".
I add those SMA values as columns to my DF object.
I iterate through the DF to catch a buy signal.
I know the buy condition holds true for some dates, but my code does not print anything out. I am expecting it to print on the buy condition at the very least.
Below you can see the condition in question:
kitem['SMA21'] >= kitem['Last']
My code:
import requests
import pandas as pd
import json

class URL_Params:
    def __init__(self, endPoint, symboll, startDate, endDate, apiKey):
        self.endPoint = endPoint
        self.symboll = symboll
        self.startDate = startDate
        self.endDate = endDate
        self.apiKey = apiKey

    def createURL(self):
        return self.endPoint + self.symboll + '?start_date=' + self.startDate + '&end_date=' + self.endDate + '&api_key=' + self.apiKey

    def add_url(self, _url):
        self.url_list

my_portfolio = {'BTC': 1.0, 'XRP': 0, 'DSH': 0, 'XMR': 0, 'TotalBTCValue': 1.0}

_endPoint = 'https://www.quandl.com/api/v3/datasets/BITFINEX/'
_symbolls = ['BTCEUR', 'XRPBTC', 'DSHBTC', 'IOTBTC', 'XMRBTC']
_startDate = '2017-01-01'
_endDate = '2019-03-01'
_apiKey = ''  # needs to be set for quandl

my_data = {}
my_conns = {}
my_col_names = ['Date', 'High', 'Low', 'Mid', 'Last', 'Bid', 'Ask', 'Volume']
orderbook = []

# create connection and load data for each pair/market.
# load them in a dict for later use
for idx_symbol in _symbolls:
    my_url_params = URL_Params(_endPoint, idx_symbol, _startDate, _endDate, _apiKey)
    response = requests.get(my_url_params.createURL())
    my_data[idx_symbol] = json.loads(response.text)

# Prepare Data
my_raw_data_df_xrpbtc = pd.DataFrame(my_data['XRPBTC']['dataset']['data'], columns=my_data['XRPBTC']['dataset']['column_names'])

# Set Index to Date Column and Sort
my_raw_data_df_xrpbtc['Date'] = pd.to_datetime(my_raw_data_df_xrpbtc['Date'])
my_raw_data_df_xrpbtc.index = my_raw_data_df_xrpbtc['Date']
my_raw_data_df_xrpbtc = my_raw_data_df_xrpbtc.sort_index()

# Drop unrelated columns
my_raw_data_df_xrpbtc.drop(['Date'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Ask'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Bid'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Low'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['High'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Mid'], axis=1, inplace=True)

# Calculate SMA values to create buy-sell signal
my_raw_data_df_xrpbtc['SMA21'] = my_raw_data_df_xrpbtc['Last'].rolling(21).mean()
my_raw_data_df_xrpbtc['SMA55'] = my_raw_data_df_xrpbtc['Last'].rolling(55).mean()
my_raw_data_df_xrpbtc['SMA200'] = my_raw_data_df_xrpbtc['Last'].rolling(200).mean()

# Check for each day: if buy signal holds, BUY; if sell signal holds, SELL
for idx, kitem in my_raw_data_df_xrpbtc.iterrows():
    if (kitem['SMA21'] >= kitem['Last']) is True:  # buy signal
        print("buy0")
        if my_portfolio['BTC'] > 0 is True:
            print("buy1")
    if (kitem['Last'] * my_portfolio['XRP']) >= (my_portfolio['BTC'] * 1.05) is True:  # sell signal
        print("sell0")
        if my_portfolio['XRP'] > 0 is True:
            print("sell1")
I know that there are lots of rows that hold true, but my code never enters this path, so it does not print out what I expect.
Could anyone please help or point out what might be wrong?
The reason is that your comparison is wrong. The result of kitem['SMA21'] >= kitem['Last'] will be a numpy.bool_. When you use is to compare it to True, this fails because it is not the same object as Python's built-in True.
If you change the comparison to ==, it will work as expected:
if (kitem['SMA21'] >= kitem['Last']) == True:
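A hedged side note beyond the original answer: the most idiomatic fix is to drop the comparison to True altogether, and be aware that expressions like my_portfolio['BTC'] > 0 is True are parsed as chained comparisons, so they are always False:

# Idiomatic: a numpy.bool_ is truthy, so no comparison to True is needed.
if kitem['SMA21'] >= kitem['Last']:  # buy signal
    print("buy0")

# Pitfall: `my_portfolio['BTC'] > 0 is True` chains as
# `(my_portfolio['BTC'] > 0) and (0 is True)`, and `0 is True` is always False.
if my_portfolio['BTC'] > 0:
    print("buy1")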
I have a dataframe with addresses in a column, and I am using the code below to extract longitude and latitude separately. Is there a way to extract longitude and latitude together, as well as extract the city, using this same approach?
In the address column of my "add" dataframe, I have addresses in the following format: "35 Turkey Hill Rd Rt 21 Ste 101, Belchertown, MA 01007"
I am using Python 3, Spyder IDE and Windows 10 desktop.
Sample data:
Address
415 E Main St, Westfield, MA 01085
200 Silver St, Agawam, MA 01001
35 Turkey Hill Rd Rt 21 Ste 101,Belchertown, MA 01007
import sys
from geopy.geocoders import Nominatim, ArcGIS, GoogleV3
#from geopy.exc import GeocoderTimedOut

arc = ArcGIS(timeout=100)
nom = Nominatim(timeout=100)
goo = GoogleV3(timeout=100)
geocoders = [arc, nom, goo]

# Capture Longitude
def geocodelng(address):
    i = 0
    try:
        while i < len(geocoders):
            # try to geocode using a service
            location = geocoders[i].geocode(address)
            # if it returns a location
            if location != None:
                # return those values
                return location.longitude
            else:
                # otherwise try the next one
                i += 1
    except:
        # catch whatever errors, likely timeout, and return null values
        print(sys.exc_info()[0])
        return ['null', 'null']
    # if all services have failed to geocode, return null values
    return ['null', 'null']

# Extract co-ordinates
add['longitude'] = add['Address'].apply(geocodelng)
# Capture Latitude
def geocodelat(address):
    i = 0
    try:
        while i < len(geocoders):
            # try to geocode using a service
            location = geocoders[i].geocode(address)
            # if it returns a location
            if location != None:
                # return those values
                return location.latitude
            else:
                # otherwise try the next one
                i += 1
    except:
        # catch whatever errors, likely timeout, and return null values
        print(sys.exc_info()[0])
        return ['null', 'null']
    # if all services have failed to geocode, return null values
    return ['null', 'null']

# Extract co-ordinates
add['latitude'] = add['Address'].apply(geocodelat)
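A hedged sketch of one way to geocode each address only once and expand the result into several columns. The geocode_all helper and the city lookup are assumptions, not confirmed geopy behavior for every backend: location.raw is the provider's raw response, and for Nominatim extracting the city typically requires geocode(address, addressdetails=True) before reading location.raw['address']; treat this as a starting point, not a definitive implementation:

import pandas as pd

def geocode_all(address):
    # Try each geocoder in turn; return (latitude, longitude, city) from the first hit.
    for geocoder in geocoders:
        try:
            location = geocoder.geocode(address)
        except Exception as e:
            print(e)
            continue
        if location is not None:
            # City extraction is provider-specific; adjust per geocoder.
            raw = location.raw if isinstance(location.raw, dict) else {}
            city = raw.get('address', {}).get('city')
            return pd.Series([location.latitude, location.longitude, city])
    return pd.Series([None, None, None])

add[['latitude', 'longitude', 'city']] = add['Address'].apply(geocode_all)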