I have a dataframe with addresses in a column and I am using the code below to extract longitude and latitude separately. Is there a way to extract longitude and latitude together as well as extract city using this same approach?
In address column of my "add" dataframe, I have addresses in the following format: "35 Turkey Hill Rd Rt 21 Ste 101,Belchertown, MA 01007"
I am using Python 3, Spyder IDE and Windows 10 desktop.
Sample data:
Address
415 E Main St, Westfield, MA 01085
200 Silver St, Agawam, MA 01001
35 Turkey Hill Rd Rt 21 Ste 101,Belchertown, MA 01007
from geopy.geocoders import Nominatim, ArcGIS, GoogleV3
#from geopy.exc import GeocoderTimedOut
arc=ArcGIS(timeout=100)
nom = Nominatim(timeout=100)
goo = GoogleV3(timeout=100)
geocoders = [arc,nom, goo]
# Capture Longitude
def geocodelng(address):
i = 0
try:
while i < len(geocoders):
# try to geocode using a service
location = geocoders[i].geocode(address)
# if it returns a location
if location != None:
# return those values
return location.longitude
else:
# otherwise try the next one
i += 1
except:
# catch whatever errors, likely timeout, and return null values
print sys.exc_info()[0]
return ['null','null']
# if all services have failed to geocode, return null values
return ['null','null']
#Extract co-ordinates
add['longitude']=add['Address'].apply(geocodelng)
# Capture Latitude
def geocodelat(address):
i = 0
try:
while i < len(geocoders):
# try to geocode using a service
location = geocoders[i].geocode(address)
# if it returns a location
if location != None:
# return those values
return location.latitude
else:
# otherwise try the next one
i += 1
except:
# catch whatever errors, likely timeout, and return null values
print sys.exc_info()[0]
return ['null','null']
# if all services have failed to geocode, return null values
return ['null','null']
#Extract co-ordinates
add['latitude']=add['Address'].apply(geocodelat)
Related
In the below code block, I have a dataframe, geo, which I want to iterate over to get the easting, northing, longitude and latitude for each UK postcode in geo. I've written a function to call the API and another to return the four variables.
I've tested the get_data call with a postcode to prove it works (this is a public API anyone can use):
import requests
import pandas as pd
geo = spark.table('property_address').toPandas()
def call_api(url: str) -> dict:
postcode_response =requests.get(url)
return postcode_response.json()
def get_data(postcode):
url = f"http://api.getthedata.com/postcode/{postcode}"
req = r.get(url)
results = req.json()['data']
easting = results['easting']
northing = results['northing']
latitude = results['latitude']
longitude = results ['longitude']
return easting ,northing,latitude, longitude
get_data('SW1A 1AA')
which returns:
Out[108]: (529090, 179645, '51.501009', '-0.141588')
Want I want to do is run that for each row in geo and return it as a dataset. My research has led me to apply, and I've based my attempt on this guide.
I'm trying to pass a column called property_postcode in geo and iterate each row to return the values, here's my attempt:
def get_columns(row):
column_name = 'property_postcode'
api_param = row[column_name]
easting,northing,latitude,longitude = get_data(api_param)
row['east'] = easting
row['north'] = northing
row['lat'] = latitude
row['long'] = longitude
return row
geo= geo.apply(get_columns, axis=1)
display(geo)
The error I get is
`JSONDecodeError: Expecting value: line 1 column 1 (char 0)`
Doesn't tell me a huge amount. Looking for assistance\pointers.
Instead of trying to set the values for the east, north, lat and long columns in the function return them from the function.
from numpy import result_type
import requests
import pandas as pd
# geo = spark.table('property_address').toPandas()
def call_api(url: str) -> dict:
postcode_response = requests.get(url)
return postcode_response.json()
def get_data(postcode):
url = f"http://api.getthedata.com/postcode/{postcode}"
req = requests.get(url)
if req.json()["status"] == "match":
results = req.json()["data"]
easting = results.get("easting")
northing = results.get("northing")
latitude = results.get("latitude")
longitude = results.get("longitude")
else:
easting = None
northing = None
latitude = None
longitude = None
return easting, northing, latitude, longitude
def get_columns(code):
api_param = code
return get_data(api_param)
df = pd.DataFrame(
{
"property_postcode": [
"BE21 6NZ",
"SW1A 1AA",
"W1A 1AA",
"DE21",
"B31",
"ST16 2NY",
"S65 1EN",
]
}
)
df[["east", "north", "lat", "long"]] = df.apply(
lambda row: get_columns(row["property_postcode"]), axis=1, result_type="expand"
)
print(df)
property_postcode
east
north
lat
long
BE21 6NZ
NaN
NaN
None
None
SW1A 1AA
529090
179645
51.501009
-0.141588
W1A 1AA
528887
181593
51.518561
-0.143799
DE21
NaN
NaN
None
None
B31
NaN
NaN
None
None
ST16 2NY
391913
323540
52.809346
-2.121413
S65 1EN
444830
394082
53.44163
-1.326573
I try to get the data from pyOWM package using city name but in some cases because of city typo error
not getting data & it breaks the process.
I want to get the weather data using lat-long but don't know how to set function for it.
Df1:
-----
User City State Zip Lat Long
-----------------------------------------------------------------------------
A Kuala Lumpur Wilayah Persekutuan 50100 5.3288907 103.1344397
B Dublin County Dublin NA 50.2030506 14.5509842
C Oconomowoc NA NA 53.3640384 -6.1953066
D Mumbai Maharashtra 400067 19.2177166 72.9708833
E Mratin Stredocesky kraj 250 63 40.7560585 -5.6924778
.
.
.
----------------------------------
Code:
--------
import time
from tqdm.notebook import tqdm
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
cities = Df1["City"].unique().tolist()
cities1 = cities [:5]
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()
'''
Step-1 Define list where save the data
'''
list_wind_Speed =[]
list_tempreture =[]
list_max_temp =[]
list_min_temp =[]
list_humidity =[]
list_pressure =[]
list_city = []
list_cloud=[]
list_status =[]
list_rain =[]
'''
Step-2 Fetch data
'''
j=0
for city in tqdm(cities1):
j=+1
if j < 60:
# one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
# one_call_obs.current.humidity
observation = mgr.weather_at_place(city)
l = observation.weather
list_city.append(city)
list_wind_Speed.append(l.wind()['speed'])
list_tempreture.append(l.temperature('celsius')['temp'])
list_max_temp.append(l.temperature('celsius')['temp_max'])
list_min_temp.append(l.temperature('celsius')['temp_min'])
list_humidity.append(l.humidity)
list_pressure.append(l.pressure['press'])
list_cloud.append(l.clouds)
list_rain.append(l.rain)
else:
time.sleep(60)
j=0
'''
Step-3 Blank data frame and store data in that
'''
df2 = pd.DataFrame()
df2["City"] = list_city
df2["Temp"] = list_tempreture
df2["Max_Temp"] = list_max_temp
df2["Min_Temp"] = list_min_temp
df2["Cloud"] = list_cloud
df2["Humidity"] = list_humidity
df2["Pressure"] = list_pressure
df2["Status"] = list_status
df2["Rain"] = list_status
df2
From the above code, I get the result as below,
City | Temp |Max_Temp|Min_Temp|Cloud |Humidity|Pressure |Status | Rain
------------------------------------------------------------------------------------------
Kuala Lumpur|29.22 |30.00 |27.78 | 20 |70 |1007 | moderate rain | moderate rain
Dublin |23.12 |26.43 |22.34 | 15 |89 | 978 | cloudy | cloudy
...
Now because of some city typo error processes getting stop,
Looking for an alternate solution of it and try to get weather data from Lat-Long but don't know how to set function for pass lat & long column data.
Df1 = {'User':['A','B','C','D','E'],
'City':['Kuala Lumpur','Dublin','Oconomowoc','Mumbai','Mratin'],
'State':['Wilayah Persekutuan','County Dublin',NA,1'Maharashtra','Stredocesky kraj'],
'Zip': [50100,NA,NA,400067,250 63],
'Lat':[5.3288907,50.2030506,53.3640384,19.2177166,40.7560585],
'Long':[103.1344397,14.5509842,-6.1953066,72.9708833,-5.6924778]}
# Try to use this code to get wather data
# one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
# one_call_obs.current.humidity
Expected Result
--------------
User | City | Lat | Long | Temp | Cloud | Humidity | Pressure | Rain | Status
-----------------------------------------------------------------------------
Catch the error if a city is not found, parse the lat/lon from the dataframe. Use that lat/lon to create a bounding box and use weather_at_places_in_bbox to get a list of observations in that area.
import time
from tqdm.notebook import tqdm
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
import pandas as pd
from pyowm.commons.exceptions import NotFoundError, ParseAPIResponseError
df1 = pd.DataFrame({'City': ('Kuala Lumpur', 'Dublin', 'Oconomowoc', 'Mumbai', 'C airo', 'Mratin'),
'Lat': ('5.3288907', '50.2030506', '53.3640384', '19.2177166', '30.22', '40.7560585'),
'Long': ('103.1344397', '14.5509842', '-6.1953066', '72.9708833', '31', '-5.6924778')})
cities = df1["City"].unique().tolist()
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()
for city in cities:
try:
observation = mgr.weather_at_place(city)
# print(city, observation)
except NotFoundError:
# get city by lat/lon
lat_top = float(df1.loc[df1['City'] == city, 'Lat'])
lon_left = float(df1.loc[df1['City'] == city, 'Long'])
lat_bottom = lat_top - 0.3
lon_right = lon_left + 0.3
try:
observations = mgr.weather_at_places_in_bbox(lon_left, lat_bottom, lon_right, lat_top, zoom=5)
observation = observations[0]
except ParseAPIResponseError:
raise RuntimeError(f"Couldn't find {city} at lat: {lat_top} / lon: {lon_right}, try tweaking the bounding box")
weather = observation.weather
temp = weather.temperature('celsius')['temp']
print(f"The current temperature in {city} is {temp}")
Given a small dataset df as follows:
id name address
0 1 ABC tower 北京市朝阳区
1 2 AC park 北京市海淀区
2 3 ZR hospital 上海市黄浦区
3 4 Fengtai library NaN
4 5 Square Point 上海市虹口区
I would like to obtain longitude and latidude for address column and append them to orginal dataframe. Please note there are NaNs in address column.
The code below gives me a table with addresses, longitude and latitude, but it ignores the NaN address rows, also the code should be improved:
import pandas as pd
import requests
import json
df = df[df['address'].notna()]
res = []
for addre in df['address']:
url = "http://restapi.amap.com/v3/geocode/geo?key=f057101329c0200f170be166d9b023a1&address=" + addre
dat = {
'count': "1",
}
r = requests.post(url, data = json.dumps(dat))
s = r.json()
infos = s['geocodes']
for j in range(0, 10000):
# print(j)
try:
more_infos = infos[j]
# print(more_infos)
except:
continue
try:
data = more_infos['location']
# print(data)
except:
continue
try:
lon_lat = data.split(',')
lon = float(lon_lat[0])
lat = float(lon_lat[1])
except:
continue
res.append([addre, lon, lat])
result = pd.DataFrame(res)
result.columns = ['address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index = False)
Out:
address longitude latitude
0 北京市朝阳区 116.601144 39.948574
1 北京市海淀区 116.329519 39.972134
2 上海市黄浦区 121.469240 31.229860
3 上海市虹口区 121.505133 31.264600
But how could I get the final result as follows? Thanks for your kind help at advance.
id name address longitude latitude
0 1 ABC tower 北京市朝阳区 116.601144 39.948574
1 2 AC park 北京市海淀区 116.329519 39.972134
2 3 ZR hospital 上海市黄浦区 121.469240 31.229860
3 4 Fengtai library NaN NaN NaN
4 5 Square Point 上海市虹口区 121.505133 31.264600
use pd.merge, as result is the longitude & latitude dataframe.
dfn = pd.merge(df, result, on='address', how='left')
or
for _, row in df.iterrows():
_id = row['id']
name = row['name']
addre = row['address']
if pd.isna(row['address']):
res.append([_id, name, addre, None, None])
continue
###### same code ######
url = '...'
# ...
###### same code ######
res.append([_id, name, addre, lon, lat])
result = pd.DataFrame(res)
result.columns = ['id', 'name', 'address', 'longitude', 'latitude']
print(result)
result.to_excel('result.xlsx', index = False)
I am supposed to get certain information from a .txt file and output it. This is the information I need:
State with the maximum population
State with the minimum population
Average state population
State of Texas population
The DATA looks like:
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
This is my code that does not really do anything I need it to do:
def main():
# Open the StateCensus2010.txt file.
census_file = open('StateCensus2010.txt', 'r')
# Read the state name
state_name = census_file.readline()
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
print('State Name: ', state_name)
print('State Abv.: ', state_abv)
print('Population: ', population)
print()
state_name = census_file.readline()
census_file.close()
main()
All I have it doing is reading the state name, abv and converting the population into an int. I don't need it to do anything of that, however I'm unsure how to do what the assignment is asking. Any hints would definitely be appreciated! I've been trying some things for the past few hours to no avail.
Update:
This is my updated code however I'm receving the following error:
Traceback (most recent call last):
File "main.py", line 13, in <module>
if population > max_population:
TypeError: unorderable types: str() > int()
Code:
with open('StateCensus2010.txt', 'r') as census_file:
while True:
try:
state_name = census_file.readline()
state_abv = census_file.readline()
population = int(census_file.readline())
except IOError:
break
# data processing here
max_population = 0
for population in census_file:
if population > max_population:
max_population = population
print(max_population)
As the data is in consistent order; Statename, State Abv, Population. So you just need to read the lines one time, and display all three 3 information. Below is the sample code.
average = 0.0
total = 0.0
state_min = 999999999999
state_max = 0
statename_min = ''
statename_max = ''
texas_population = 0
with open('StateCensus2010.txt','r') as file:
# split new line, '\n' here means newline
data = file.read().split('\n')
# get the length of the data by using len() method
# there are 50 states in the text file
# each states have 3 information stored,
# state name, state abreviation, population
# that's why length of data which is 150/3 = 50 states
state_total = len(data)/3
# this count is used as an index for the list
count = 0
for i in range(int(state_total)):
statename = data[count]
state_abv = data[count+1]
population = int(data[count+2])
print('Statename : ',statename)
print('State Abv : ',state_abv)
print('Population: ',population)
print()
# sum all states population
total += population
if population > state_max:
state_max = population
statename_max = statename
if population < state_min:
state_min = population
statename_min = statename
if statename == 'Texas':
texas_population = population
# add 3 because we want to jump to next state
# for example the first three lines is Alabama info
# the next three lines is Alaska info and so on
count += 3
# divide the total population with number of states
average = total/state_total
print(str(average))
print('Lowest population state :', statename_min)
print('Highest population state :', statename_max)
print('Texas population :', texas_population)
This problem is pretty easy using pandas.
Code:
states = []
for line in data:
states.append(
dict(state=line.strip(),
abbrev=next(data).strip(),
pop=int(next(data)),
)
)
df = pd.DataFrame(states)
print(df)
print('\nmax population:\n', df.ix[df['pop'].idxmax()])
print('\nmin population:\n', df.ix[df['pop'].idxmin()])
print('\navg population:\n', df['pop'].mean())
print('\nAZ population:\n', df[df.abbrev == 'AZ'])
Test Data:
from io import StringIO
data = StringIO(u'\n'.join([x.strip() for x in """
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
""".split('\n')[1:-1]]))
Results:
abbrev pop state
0 AL 4802982 Alabama
1 AK 721523 Alaska
2 AZ 6412700 Arizona
3 AR 2926229 Arkansas
4 CA 37341989 California
max population:
abbrev CA
pop 37341989
state California
Name: 4, dtype: object
min population:
abbrev AK
pop 721523
state Alaska
Name: 1, dtype: object
avg population:
10441084.6
AZ population:
abbrev pop state
2 AZ 6412700 Arizona
Another pandas solution, from the interpreter:
>>> import pandas as pd
>>>
>>> records = [line.strip() for line in open('./your.txt', 'r')]
>>>
>>> df = pd.DataFrame([records[i:i+3] for i in range(0, len(records), 3)],
... columns=['State', 'Code', 'Pop']).dropna()
>>>
>>> df['Pop'] = df['Pop'].astype(int)
>>>
>>> df
State Code Pop
0 Alabama AL 4802982
1 Alaska AK 721523
2 Arizona AZ 6412700
3 Arkansas AR 2926229
4 California CA 37341989
>>>
>>> df.ix[df['Pop'].idxmax()]
State California
Code CA
Pop 37341989
Name: 4, dtype: object
>>>
>>> df.ix[df['Pop'].idxmin()]
State Alaska
Code AK
Pop 721523
Name: 1, dtype: object
>>>
>>> df['Pop'].mean()
10441084.6
>>>
>>> df.ix[df['Code'] == 'AZ' ]
State Code Pop
2 Arizona AZ 6412700
Please try this the earlier code was not python 3 compatible. It supported python 2.7
def extract_data(state):
total_population = 0
for states, stats in state.items():
population = stats.get('population')
state_name = stats.get('state_name')
states = states
total_population = population + total_population
if 'highest' not in vars():
highest = population
higherst_state_name = state_name
highest_state = states
if 'lowest' not in vars():
lowest = population
lowest_state_name = state_name
lowest_state = states
if highest < population:
highest = population
higherst_state_name = state_name
highest_state = states
if lowest > population:
lowest = population
lowest_state_name = state_name
lowest_state = states
print(highest_state, highest)
print(lowest_state, lowest)
print(len(state))
print(int(total_population/len(state)))
print(state.get('TX').get('population'))
def main():
# Open the StateCensus2010.txt file.
census_file = open('states.txt', 'r')
# Read the state name
state_name = census_file.readline()
state = {}
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
if state_abv in state:
state[state_abv].update({'population': population, 'state_name': state_name})
else:
state.setdefault(state_abv,{'population': population, 'state_name': state_name})
state_name = census_file.readline()
census_file.close()
return state
state=main()
extract_data(state)
Im trying to develop a mapreduce program to show the city with max temperature from a text file.
The text file "temperatures.txt" have this content (city and temperature):
City1 10
City2 11
City3 4
City4 20
...
city10000 22
In this example, the result I want is to print this last line, that have higher temperature:
city10000 22
I have the reducer file like this:
import sys
current_city = None
current_max = 0
city = None
for line in sys.stdin:
line = line.strip()
city, temperature = line.rsplit('\t', 1)
try:
temperature = float(temperature)
except ValueError:
continue
if current_city == city:
if temperature > current_max:
current_max = temperature
else:
if current_city:
print '%s\t%s' % (current_city, current_max)
current_max = temperature
current_city = city
if current_city == city:
print '%s\t%s' % (current_city, current_max)
But, when I test this reducer.py file, Im having always the same result, Im getting always all cities and temperatures, like this:
City1 10
City2 11
City3 4
City4 20
...
city10000 22
Do you see anything wrong in my reducer file?
I just want to show the city with max temperature, in this case city with max temperature is city10000, so I want only this result:
city10000 22
First of all let me explain where I believe the code has gone wrong and then I will supply a working example. The problem is with the if else statement in the reducer.
Here is the if part:
if current_city == city:
if temperature > current_max:
current_max = temperature
This will only happen if the same city is listed twice, more importantly this is the only place where the code checks if the temperature of the new city is bigger than the current_max.
I suspect most of the time will be spent in the else part of the statement:
else:
if current_city:
print '%s\t%s' % (current_city, current_max)
current_max = temperature
current_city = city
There are two problems here:
The program will always print a line when current_city is defined. This is what is producing your list of cities from the reducer.
The program also assists the current_max variable without check if temperature is bigger.
Here is a reducer that should work:
import sys
current_city = None
current_max = 0
city = None
for line in sys.stdin:
line = line.strip()
city, temperature = line.rsplit('\t', 1)
try:
temperature = float(temperature)
except ValueError:
continue
if temperature > current_max:
current_max = temperature
current_city = city
print '%s\t%s' % (current_city, current_max)
The last thing I would mention is that it is not a good idea setting current_max = 0. Temperatures in celsius can easily be below zero. If your list of cities and temperature were during winter, there is a possibility that none of them had a temperature over 0 and the code would return:
None 0.0
I don't know much about python. But you can follow below scenario:
-> create a map to store key as city and value as temperature.
-> Now, store first five cities with their temperature in map.
-> After 5 cities, compare the temperature of each city with all 5 cities in map. If temperature of any city in the map is less then replace that city and their temperature with new city.
-> At the end, You can print the map. This will get the 5 cities with maximum temperature.
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Weather {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String year = line.substring(15, 19);
int airTemperature;
if (line.charAt(87) == '+') {
airTemperature= Integer.parseInt(line.substring(88, 92));
}
else
airTemperature= - Integer.parseInt(line.substring(88, 92));
if(airTemperature!=9999 && airTemperature!=-9999){
airTemperature/=10;
context.write(new Text(year),new IntWritable(airTemperature));
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int maxValue=Integer.MIN_VALUE;
Iterator<IntWritable> itr = values.iterator();
while(itr.hasNext()){
maxValue = Math.max(maxValue,itr.next().get());
}
context.write(key, new IntWritable(maxValue));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "temparature");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setCombinerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("hdfs://localhost:8020/input/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:8020/output/"));
// FileInputFormat.addInputPath(job, new Path(args[0]));
// FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Dataset of Weather data:
0029029070999991901010106004+64333+023450FM-12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999