Find coordinates in wikipedia pages iterating over a list

Find coordinates in wikipedia pages iterating over a list - python

Probably this is a simple question, but my experience in for loop is very limited.
I was trying to adapt the solution in this page https://www.mediawiki.org/wiki/API:Geosearch with some simple examples that i have, but the result is not what i expected.
For example:
I have this simple data frame:
df= pd.DataFrame({'City':['Sesimbra','Ciudad JuÃ¡rez','31100 Treviso','Ramada Portugal','Olhão'],
'Country':['Portugal','México','Itália','Portugal','Portugal']})
I created a list based on cities:
lista_cidades = list(df['City'])
and i would like to iterate over this list to get the coordinates (decimal, preferably)
So far i tried this approach:
import requests
lng_dict = {}
lat_dict = {}
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
"action": "query",
"format": "json",
"titles": [lista_cidades],
"prop": "coordinates"
}
R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']
for i in range(len(lista_cidades)):
for k, v in PAGES.items():
try:
lat_dict[lista_cidades[i]] = str(v['coordinates'][0]['lat'])
lng_dict[lista_cidades[i]] = str(v['coordinates'][0]['lon'])
except:
pass
but it looks like the code doesn't iterate over the list and always returns the same coordinate
For example, when i call the dictionary with latitude coordinates, this is what i get
lng_dict
{'Sesimbra': '-7.84166667',
'Ciudad JuÃ¡rez': '-7.84166667',
'31100 Treviso': '-7.84166667',
'Ramada Portugal': '-7.84166667',
'Olhão': '-7.84166667'}
What should i do to solve this?
Thanks in advance

I think the query returns only one result, it will take only the last city from you list (in your cas the "Olhão" coordinates).
You can check it by logging the DATA content.
I do not know about wikipedia API, but either your call lack a parameter (documentation should give you the information) or you have to call the API for each city like :
import pandas as pd
import requests
df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad JuÃ¡rez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})
lista_cidades = list(df['City'])
lng_dict = {}
lat_dict = {}
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
for city in lista_cidades:
PARAMS = {
"action": "query",
"format": "json",
"titles": city,
"prop": "coordinates"
}
R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']
for k, v in PAGES.items():
try:
lat_dict[city] = str(v['coordinates'][0]['lat'])
lng_dict[city] = str(v['coordinates'][0]['lon'])
except:
pass

Related

Creating multiple dataframe using loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies and I have attached the code for the same below. Now, I want to pass three urls and in return I need three different different dictionaries which should have the values. I'm stuck and I don't understand how should I go about it. I have tried using loops but it is not working out for me.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}
for ele in url:
#### requesting the page and extracting the script which has date and values
session = requests.Session()
page = session.get(ele[i])
soup = BeautifulSoup(page.content, 'html.parser')
values = str(soup.find_all('script')[4])
values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
#create an empty dict to append date and hashrates
dict([("crypto_1 %s" % i,[]) for i in range(len(url))])
#run a loop over all the dates and adding to dictionary
for i in range(values.count('new Date')):
date = values.split('new Date("')[i+1].split('"')[0]
value = values.split('"),')[i+1].split(']')[0]
dict([("crypto_1 %s" % i)[date] = value

You can use next example how to get data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd
url = {
"Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
"Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
"Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}
data = []
for name, u in url.items():
html_doc = requests.get(u).text
for date, hash_rate in re.findall(
r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
):
data.append(
{
"Name": name,
"Date": date,
"Hash Rate": float("nan")
if hash_rate == "null"
else float(hash_rate),
}
)
df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])
# here save df to CSV
# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")
print(out)
Prints:
{
"Bitcoin": {
"Date": [
Timestamp("2009-01-03 00:00:00"),
Timestamp("2009-01-04 00:00:00"),
Timestamp("2009-01-05 00:00:00"),
...

Is there a way to print out certain elements of the JSON file in python?

I am using a (dutch) weather API and the result is showing a lot of information. However, I only want to print the temperature and location. Is there a way to filter out these keys?
from pip._vendor import requests
import json
response = requests.get(" http://weerlive.nl/api/json-data-10min.php?key=demo&locatie=52.0910879,5.1124231")
def jprint(obj):
weer = json.dumps(obj, indent=2)
print(weer)
jprint(response.json())
result:
{
"liveweer": [
{
"place": "Utrecht",
"temp": "7.7",
"gtemp": "5.2",
"summa": "Dry after rain",
"lv": "89",
etc.
How do I only print out the place and temp?
Thanks in advance

import requests
response = requests.get(" http://weerlive.nl/api/json-data-10min.php?key=demo&locatie=52.0910879,5.1124231")
data = response.json()
for station in data["liveweer"]:
print(f"Temp in {station['plaats']} is {station['temp']}")
output
Temp in Utrecht is 8.0
Note that you can use convenient Response.json() method

If you expect the API to returns you a list of place, you can do:
>>> {'liveweer': [{'plaats': item['plaats'], 'temp': item['temp']}] for item in response.json()['liveweer']}
{'liveweer': [{'plaats': 'Utrecht', 'temp': '8.0'}]}

Try this:
x=response.json()
print(x['liveweer'][0]['place'])
print(x['liveweer'][0]['temp'])

If you are only interested in working with place and temp I would advise making a new dictionary.
import requests
r = requests.get(" http://weerlive.nl/api/json-data-10min.php?key=demo&locatie=52.0910879,5.1124231")
if r.status_code == 200:
data = {
"place" : r.json()['liveweer'][0]["place"],
"temp": r.json()['liveweer'][0]["temp"],
}
print(data)

How can I extract values from Tableau on this webpage

I am trying to extract the "mobility index" values for each state and county from this webpage:
https://www.cuebiq.com/visitation-insights-mobility-index/
The preferred output would be a panel data of place (state/county) by date for all available places and dates.
There is another thread (How can I scrape tooltips value from a Tableau graph embedded in a webpage) with a similar question. I tried to follow the solution there but it doesn't seem to work for my case.
Thanks a lot in advance.
(A way that I have tried is to download PDF files generated from Tableau, which would contain all counties' value on a specific date. However, I still need to find a way to make request for each date in the data. Anyway, let me know if you have a better idea than this route).

This tableau data url doesn't return any data. In fact, it only render images of the values (canvas probably) and I'm guessing it detects click based on coordinate. Probably, it's made this way to cache the value and render quickly.
But when you click on a state, it actually returns data but it seems it doesn't always returns the result for the state (but works the individual county).
The solution I've found is to use the tooltip to get the data for the state. When you click the state, it generates a request like this :
POST https://public.tableau.com/{path}/{session_id}/commands/tabsrv/render-tooltip-server
with the following form param :
worksheet: US Map - State - CMI
dashboard: CMI
tupleIds: [18]
vizRegionRect: {"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":null}
allowHoverActions: false
allowPromptText: true
allowWork: false
useInlineImages: true
where tupleIds: [18] refers to the index of the state in a list of states in reverse alphabetical order like this :
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
It gives a json with the html of the tooltip which has the CMI and YoY values you want to extract :
{
"vqlCmdResponse": {
"cmdResultList": [{
"commandName": "tabsrv:render-tooltip-server",
"commandReturn": {
"tooltipText": "{\"htmlTooltip\": \"<HTML HERE WITH THE VALUES>\"}]},\"overlayAnchors\":[]}"
}
}]
}
}
The only caveat is that you'll hava to make one request per state :
import requests
from bs4 import BeautifulSoup
import json
import time
data_host = "https://public.tableau.com"
r = requests.get(
f"{data_host}/views/CMI-2_0/CMI",
params= {
":showVizHome":"no",
}
)
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data= {
"sheet_id": tableauData["sheetId"],
})
data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
for stateIndex, state in enumerate(stateNames):
time.sleep(0.5) #for throttling
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
data = {
"worksheet": "US Map - State - CMI",
"dashboard": "CMI",
"tupleIds": f"[{stateIndex+1}]",
"vizRegionRect": json.dumps({"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":None}),
"allowHoverActions": "false",
"allowPromptText": "true",
"allowWork": "false",
"useInlineImages": "true"
})
tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
soup = BeautifulSoup(tooltip, "html.parser")
rows = [
t.find("tr").find_all("td")
for t in soup.find_all("table")
]
entry = { "state": state }
for row in rows:
if (row[0].text == "Mobility Index:"):
entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
if row[0].text == "YoY (%):":
entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
print(entry)
data.append(entry)
print(data)
Try this on repl.it
To get the county information it's the same as this post using the select endpoint which gives you the data with the same format as the post you've linked in your question
The following will extract data for all county and state :
import requests
from bs4 import BeautifulSoup
import json
import time
data_host = "https://public.tableau.com"
worksheet = "US Map - State - CMI"
dashboard = "CMI"
r = requests.get(
f"{data_host}/views/CMI-2_0/CMI",
params= {
":showVizHome":"no",
}
)
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data= {
"sheet_id": tableauData["sheetId"],
})
data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
for stateIndex, state in enumerate(stateNames):
time.sleep(0.5) #for throttling
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
data = {
"worksheet": worksheet,
"dashboard": dashboard,
"tupleIds": f"[{stateIndex+1}]",
"vizRegionRect": json.dumps({"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":None}),
"allowHoverActions": "false",
"allowPromptText": "true",
"allowWork": "false",
"useInlineImages": "true"
})
tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
soup = BeautifulSoup(tooltip, "html.parser")
rows = [
t.find("tr").find_all("td")
for t in soup.find_all("table")
]
entry = { "state": state }
for row in rows:
if (row[0].text == "Mobility Index:"):
entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
if row[0].text == "YoY (%):":
entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabdoc/select',
data = {
"worksheet": worksheet,
"dashboard": dashboard,
"selection": json.dumps({
"objectIds":[stateIndex+1],
"selectionType":"tuples"
}),
"selectOptions": "select-options-simple"
})
entry["county_data"] = r.json()["vqlCmdResponse"]["layoutStatus"]["applicationPresModel"]["dataDictionary"]["dataSegments"]
print(entry)
data.append(entry)
print(data)

API not accepting my JSON data from Python

I'm new to Python and dealing with JSON. I'm trying to grab an array of strings from my database and give them to an API. I don't know why I'm getting the missing data error. Can you guys take a look?
###########################################
rpt_cursor = rpt_conn.cursor()
sql="""SELECT `ContactID` AS 'ContactId' FROM
`BWG_reports`.`bounce_log_dummy`;"""
rpt_cursor.execute(sql)
row_headers=[x[0] for x in rpt_cursor.description] #this will extract row headers
row_values= rpt_cursor.fetchall()
json_data=[]
for result in row_values:
json_data.append(dict(zip(row_headers,result)))
results_to_load = json.dumps(json_data)
print(results_to_load) # Prints: [{"ContactId": 9}, {"ContactId": 274556}]
headers = {
'Content-Type': 'application/json',
'Accept': 'application/json',
}
targetlist = '302'
# This is for their PUT to "add multiple contacts to lists".
api_request_url = 'https://api2.xyz.com/api/list/' + str(targetlist)
+'/contactid/Api_Key/' + bwg_apikey
print(api_request_url) #Prints https://api2.xyz.com/api/list/302/contactid/Api_Key/#####
response = requests.put(api_request_url, headers=headers, data=results_to_load)
print(response) #Prints <Response [200]>
print(response.content) #Prints b'{"status":"error","Message":"ContactId is Required."}'
rpt_conn.commit()
rpt_cursor.close()
###########################################################
Edit for Clarity:
I'm passing it this [{"ContactId": 9}, {"ContactId": 274556}]
and I'm getting this response body b'{"status":"error","Message":"ContactId is Required."}'
The API doc gives this as the from to follow for the request body.
[
{
"ContactId": "string"
}
]
When I manually put this data in there test thing I get what I want.
[
{
"ContactId": "9"
},
{
"ContactId": "274556"
}
]
Maybe there is something wrong with json.dumps vs json.load? Am I not creating a dict, but rather a string that looks like a dict?
EDIT I FIGURED IT OUT!:
This was dumb.
I needed to define results_to_load = [] as a dict before I loaded it at results_to_load = json.dumps(json_data).
Thanks for all the answers and attempts to help.

I would recommend you to go and check the API docs to be specific, but from it seems, the API requires a field with the name ContactID which is an array, rather and an array of objects where every object has key as contactId
Or
//correct
{
contactId: [9,229]
}
instead of
// not correct
[{contactId:9}, {contactId:229}]
Tweaking this might help:
res = {}
contacts = []
for result in row_values:
contacts.append(result)
res[contactId] = contacts
...
...
response = requests.put(api_request_url, headers=headers, data=res)

I FIGURED IT OUT!:
This was dumb.
I needed to define results_to_load = [] as an empty dict before I loaded it at results_to_load = json.dumps(json_data).
Thanks for all the answers and attempts to help.

What is the data format returned by the AdWords API TargetingIdeaPage service?

When I query the AdWords API to get search volume data and trends through their TargetingIdeaSelector using the Python client library the returned data looks like this:
(TargetingIdeaPage){
totalNumEntries = 1
entries[] =
(TargetingIdea){
data[] =
(Type_AttributeMapEntry){
key = "KEYWORD_TEXT"
value =
(StringAttribute){
Attribute.Type = "StringAttribute"
value = "keyword phrase"
}
},
(Type_AttributeMapEntry){
key = "TARGETED_MONTHLY_SEARCHES"
value =
(MonthlySearchVolumeAttribute){
Attribute.Type = "MonthlySearchVolumeAttribute"
value[] =
(MonthlySearchVolume){
year = 2016
month = 2
count = 2900
},
...
(MonthlySearchVolume){
year = 2015
month = 3
count = 2900
},
}
},
},
}
This isn't JSON and appears to just be a messy Python list. What's the easiest way to flatten the monthly data into a Pandas dataframe with a structure like this?
Keyword | Year | Month | Count
keyword phrase 2016 2 10

The output is a sudsobject. I found that this code does the trick:
import suds.sudsobject as sudsobject
import pandas as pd
a = [sudsobject.asdict(x) for x in output]
df = pd.DataFrame(a)
Addendum: This was once correct but new versions of the API (I tested
201802) now return a zeep.objects. However, zeep.helpers.serialize_object should do the same trick.
link

Here's the complete code that I used to query the TargetingIdeaSelector, with requestType STATS, and the method I used to parse the data to a useable dataframe; note the section starting "Parse results to pandas dataframe" as this takes the output given in the question above and converts it to a dataframe. Probably not the fastest or best, but it works! Tested with Python 2.7.
"""This code pulls trends for a set of keywords, and parses into a dataframe.
The LoadFromStorage method is pulling credentials and properties from a
"googleads.yaml" file. By default, it looks for this file in your home
directory. For more information, see the "Caching authentication information"
section of our README.
"""
from googleads import adwords
import pandas as pd
adwords_client = adwords.AdWordsClient.LoadFromStorage()
PAGE_SIZE = 10
# Initialize appropriate service.
targeting_idea_service = adwords_client.GetService(
'TargetingIdeaService', version='v201601')
# Construct selector object and retrieve related keywords.
offset = 0
stats_selector = {
'searchParameters': [
{
'xsi_type': 'RelatedToQuerySearchParameter',
'queries': ['donald trump', 'bernie sanders']
},
{
# Language setting (optional).
# The ID can be found in the documentation:
# https://developers.google.com/adwords/api/docs/appendix/languagecodes
'xsi_type': 'LanguageSearchParameter',
'languages': [{'id': '1000'}],
},
{
# Location setting
'xsi_type': 'LocationSearchParameter',
'locations': [{'id': '1027363'}] # Burlington,Vermont
}
],
'ideaType': 'KEYWORD',
'requestType': 'STATS',
'requestedAttributeTypes': ['KEYWORD_TEXT', 'TARGETED_MONTHLY_SEARCHES'],
'paging': {
'startIndex': str(offset),
'numberResults': str(PAGE_SIZE)
}
}
stats_page = targeting_idea_service.get(stats_selector)
##########################################################################
# Parse results to pandas dataframe
stats_pd = pd.DataFrame()
if 'entries' in stats_page:
for stats_result in stats_page['entries']:
stats_attributes = {}
for stats_attribute in stats_result['data']:
#print (stats_attribute)
if stats_attribute['key'] == 'KEYWORD_TEXT':
kt = stats_attribute['value']['value']
else:
for i, val in enumerate(stats_attribute['value'][1]):
data = {'keyword': kt,
'year': val['year'],
'month': val['month'],
'count': val['count']}
data = pd.DataFrame(data, index = [i])
stats_pd = stats_pd.append(data, ignore_index=True)
print(stats_pd)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Find coordinates in wikipedia pages iterating over a list - python

Related

Creating multiple dataframe using loop or function

Is there a way to print out certain elements of the JSON file in python?

How can I extract values from Tableau on this webpage

API not accepting my JSON data from Python

What is the data format returned by the AdWords API TargetingIdeaPage service?

Categories

Resources