Problem with request to Foursquare API with Python

I have a pandas DataFrame df_data with city names, latitudes and longitudes.
I am trying to build a Foursquare request with my credentials to find all the malls located within a given radius of each city.
Malls_id is the Foursquare-defined ID for malls.
I get an error when I run the request.
I think the error comes from adding categoryId, but I can't find the problem.
What am I doing wrong?
Thanks
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=5):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            Malls_id,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return(nearby_venues)
# Run the above function on each location and create a new dataframe called location_venues and display it.
location_venues = getNearbyVenues(names=df_data['City'],
                                  latitudes=df_data['Latitude'],
                                  longitudes=df_data['Longitude']
                                  )
It runs through all my locations:
Warsaw
Carmel
Chesterton
Granger
Plainfield
and then it stops.
Here is the full stack trace:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-537f5e1b4f8a> in <module>()
2 location_venues = getNearbyVenues(names=df_data['City'],
3 latitudes=df_data['Latitude'],
----> 4 longitudes=df_data['Longitude']
5 )
<ipython-input-20-23c3db7fc2a3> in getNearbyVenues(names, latitudes, longitudes, radius, LIMIT)
36 'Venue Latitude',
37 'Venue Longitude',
---> 38 'Venue Category']
39
40 return(nearby_venues)
/opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
3625 try:
3626 object.__getattribute__(self, name)
-> 3627 return object.__setattr__(self, name, value)
3628 except AttributeError:
3629 pass
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
/opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
557
558 def _set_axis(self, axis, labels):
--> 559 self._data.set_axis(axis, labels)
560 self._clear_item_cache()
561
/opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
3067 raise ValueError('Length mismatch: Expected axis has %d elements, '
3068 'new values have %d elements' %
-> 3069 (old_len, new_len))
3070
3071 self.axes[axis] = new_labels
ValueError: Length mismatch: Expected axis has 0 elements, new values have 7 elements
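For reference, this particular ValueError means the DataFrame built from venues_list came out with zero columns, which happens when every request returned no items (or an error body), so there is nothing to line up with the seven column names. Below is a minimal diagnostic sketch, assuming the credentials and df_data from above; the meta print and the item count are illustrative additions, not part of the original function:

import requests

# Diagnostic sketch: call the same endpoint for one city and inspect the raw reply
# before indexing into 'groups'/'items'.
# CLIENT_ID, CLIENT_SECRET, VERSION and Malls_id are the same variables used above.
lat, lng = df_data['Latitude'].iloc[0], df_data['Longitude'].iloc[0]
url = ('https://api.foursquare.com/v2/venues/explore'
       '?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'
       ).format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, 500, Malls_id, 5)

reply = requests.get(url).json()
print(reply.get('meta'))                      # shows the status code / errorType if the call failed
groups = reply.get('response', {}).get('groups') or [{}]
items = groups[0].get('items', [])
print(len(items), 'venues returned')          # zero everywhere would leave a 0-column DataFrame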

Related

BulkIndexError: ('2 document(s) failed to index.') - Elasticsearch + Python

At first I found some null values in my preprocessed data, so I removed those.
(Here's my data cleaning code, with the respective outputs enclosed in '''comments'''.)
Cleaning and Preprocessing
df_merged[df_merged.abstract_x != df_merged.abstract_y].shape
# this means that for 22,728 of the 25,000 samples the abstract does not match between the metadata and the PDF data
'''(22728, 22)'''
# check metadata abstract column to see if null values exist
df_merged.abstract_x.isnull().sum()
'''3363'''
# Check pdf_json abstract to see if null values exist
df_merged.abstract_y.isnull().sum()
'''0'''
# Since abstract_x from the metadata is more reliable, we will use it and only fill in from abstract_y when abstract_x is null
# Convert all columns to string and then replace abstract_y values
#df = df.astype(str)
df_merged['abstract_y'] = df_merged['abstract_y'].astype(str)
df_merged['abstract_y'] = np.where(df_merged['abstract_y'].map(len) > 50, df_merged['abstract_y'], 'na')
df_merged.loc[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na'), 'abstract_x'] = df_merged[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na')].abstract_y #we want to overwrite the abstract_x column and abstract_y has to be not na
df_merged.abstract_x.isnull().sum()
'''
2745
'''
df_merged.rename(columns={'abstract_x': 'abstract'}, inplace=True)
df_merged.columns
'''
Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
'url', 's2_id', 'abstract_y', 'body_text_x', 'body_text_y'],
dtype='object')
'''
df_merged = df_merged.drop(['abstract_y'], axis=1)
df_merged.columns
'''
Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
'url', 's2_id', 'body_text_x', 'body_text_y'],
dtype='object')
'''
(df_merged.body_text_x != df_merged.body_text_y).sum()
'''25000'''
df_merged.body_text_x.isnull().sum()
'''1526'''
df_merged.body_text_y.isnull().sum()
'''5238'''
df_merged[df_merged.body_text_x.isnull() & df_merged.body_text_y.notnull()].shape
'''(1447, 21)'''
# when body_text_y is not null, we put body_text_y into body_text_x
df_merged.loc[df_merged.body_text_y.notnull(), 'body_text_x'] = df_merged.loc[df_merged.body_text_y.notnull(), 'body_text_y']
df_merged.body_text_x.isnull().sum()
'''79'''
df_merged.columns
'''
Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
'url', 's2_id', 'body_text_x', 'body_text_y'],
dtype='object')
'''
df_merged.rename(columns={'body_text_x': 'body_text'}, inplace=True)
df_merged = df_merged.drop(['body_text_y'], axis=1)
df_merged.columns
'''
Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
'url', 's2_id', 'body_text'],
dtype='object')
'''
df_final = df_merged[['sha', 'title', 'abstract', 'publish_time', 'authors', 'url', 'body_text']]
df_final.head()
sha title abstract publish_time authors url body_text
0 1cbf95a2c3a39e5cc80a5c4c6dbcec7cc718fd59 Genomic Evolution of Severe Acute Respiratory ... Abstract Recent emergence of severe acute resp... 2020-08-31 Jacob, Jobin John; Vasudevan, Karthick; Veerar... https://api.elsevier.com/content/article/pii/S... The outbreak of severe acute respiratory syndr...
1 7dc6943ca46a1093ece2594002d61efdf9f51f28 Impact of COVID-19 on COPD and Asthma admissio... Asthma and Chronic Obstructive Pulmonary Disea... 2020-12-10 Sykes, Dominic L; Faruqi, Shoaib; Holdsworth, ... https://www.ncbi.nlm.nih.gov/pubmed/33575313/;... The COVID-19 pandemic has led to an overall re...
2 5b127336f68f3dca83981d0142eda472634378f0 Programmable System of Cas13-Mediated RNA Modi... Clustered regularly interspaced short palindro... 2021-07-27 Tang, Tian; Han, Yingli; Wang, Yuran; Huang, H... https://www.ncbi.nlm.nih.gov/pubmed/34386490/;... Prokaryotic clustered regularly interspaced sh...
3 aafbe282248436380dd737bae844725882df2249 Are You Tired of Working amid the Pandemic? Th... With the outbreak of novel coronavirus in 2019... 2020-12-09 Chen, Huaruo; Liu, Fan; Pang, Liman; Liu, Fei;... https://doi.org/10.3390/ijerph17249188; https:... In the outbreak of novel coronavirus pneumonia...
4 4013a7e351c40d2bb7fdfe7f185d2ef9b1a872e6 Viral Sepsis in Children Sepsis in children is typically presumed to be... 2018-09-18 Gupta, Neha; Richter, Robert; Robert, Stephen;... https://www.ncbi.nlm.nih.gov/pubmed/30280095/;... The true incidence of viral sepsis, particular...
df_final = df_final.dropna(axis=0,subset=['abstract', 'body_text'])
df_final.isnull().sum()
'''
sha 0
title 0
abstract 0
publish_time 0
authors 104
url 0
body_text 0
dtype: int64
'''
df_final.shape
'''(22186, 7)'''
df_final.to_csv('FINAL_CORD_DATA.csv', index=False)
Whenever I try to use the sample dataset I created in my es_populate notebook with the sparse retriever, I keep getting:
BulkIndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19912/2533749049.py in <module>
----> 1 document_store.write_documents(final_dicts)
~\anaconda3\lib\site-packages\haystack\document_store\elasticsearch.py in write_documents(self, documents, index, batch_size, duplicate_documents)
426 # Pass batch_size number of documents to bulk
427 if len(documents_to_index) % batch_size == 0:
--> 428 bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
429 documents_to_index = []
430
~\anaconda3\lib\site-packages\elasticsearch\helpers\actions.py in bulk(client, actions, stats_only, *args, **kwargs)
388 # make streaming_bulk yield successful results so we can count them
389 kwargs["yield_ok"] = True
--> 390 for ok, item in streaming_bulk(client, actions, *args, **kwargs):
391 # go through request-response pairs and detect failures
392 if not ok:
~\anaconda3\lib\site-packages\elasticsearch\helpers\actions.py in streaming_bulk(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, *args, **kwargs)
309
310 try:
--> 311 for data, (ok, info) in zip(
312 bulk_data,
313 _process_bulk_chunk(
~\anaconda3\lib\site-packages\elasticsearch\helpers\actions.py in _process_bulk_chunk(client, bulk_actions, bulk_data, raise_on_exception, raise_on_error, *args, **kwargs)
245 resp=resp, bulk_data=bulk_data, raise_on_error=raise_on_error
246 )
--> 247 for item in gen:
248 yield item
249
~\anaconda3\lib\site-packages\elasticsearch\helpers\actions.py in _process_bulk_chunk_success(resp, bulk_data, raise_on_error)
186
187 if errors:
--> 188 raise BulkIndexError("%i document(s) failed to index." % len(errors), errors)
189
190
BulkIndexError: ('2 document(s) failed to index.', [{'index': {'_index': 'document', '_type': '_doc', '_id': '9d04e1c37a299818d82416898ffe22d6', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'json_parse_exception', 'reason': "Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow\n at [Source: (ByteArrayInputStream); line: 1, column: 217076]"}}, 'data': {'text': 'Increase
My method of using the document store was:
# Connect to Elasticsearch
from haystack.document_store import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
C:\Users\manan\anaconda3\lib\site-packages\elasticsearch\connection\base.py:190: ElasticsearchDeprecationWarning: Elasticsearch built-in security features are not enabled. Without authentication, your cluster could be accessible to anyone. See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/security-minimal-setup.html to enable security.
warnings.warn(message, category=ElasticsearchDeprecationWarning)
02/20/2022 00:58:28 - INFO - elasticsearch - HEAD http://localhost:9200/ [status:200 request:0.227s]
02/20/2022 00:58:28 - INFO - elasticsearch - HEAD http://localhost:9200/document [status:200 request:0.015s]
02/20/2022 00:58:28 - INFO - elasticsearch - GET http://localhost:9200/document [status:200 request:0.011s]
02/20/2022 00:58:28 - INFO - elasticsearch - PUT http://localhost:9200/document/_mapping [status:200 request:0.087s]
02/20/2022 00:58:28 - INFO - elasticsearch - HEAD http://localhost:9200/label [status:200 request:0.006s]
document_store.write_documents(final_dicts)
02/20/2022 00:58:34 - INFO - elasticsearch - POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:3.887s]
02/20/2022 00:58:38 - INFO - elasticsearch - POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:3.464s]
followed by the above error.
I'm very new to this, and would appreciate any help that could come my way.
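For reference, the json_parse_exception in the traceback complains about a literal NaN token; the cleaned CSV above still contains nulls (104 missing authors), and pandas reads those back as float NaN, which then lands in the bulk request body. A minimal sketch of scrubbing them out before write_documents follows; the 'text'/'meta' layout of final_dicts is assumed from the error payload, so adjust it to however the dicts were actually built:

import math

# Sketch (an assumed fix, not confirmed): replace float NaN values with empty strings
# so the bulk JSON never contains the non-standard 'NaN' token Elasticsearch rejects.
def without_nan(value):
    return '' if isinstance(value, float) and math.isnan(value) else value

for doc in final_dicts:                     # final_dicts as built in the notebook above
    doc['text'] = without_nan(doc.get('text'))
    doc['meta'] = {k: without_nan(v) for k, v in doc.get('meta', {}).items()}

document_store.write_documents(final_dicts)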

Using the Foursquare API with Python

How do I extract the list of hospitals in each neighborhood of a city using the Foursquare API and put it into a DataFrame?
This is what I am trying to achieve as a DataFrame:
   Neighborhood   No. of hospitals
0  Neighborhood1  5
1  Neighborhood2  1
2  Neighborhood3  3
3  Neighborhood4  4
4  Neighborhood5  5
I am trying out code from a previous tutorial to achieve this. I expected the error, but I don't know where to start.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={}&query=supermarket,{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return(nearby_venues)
Next cell:
Toronto_venues = getNearbyVenues(names=Toronto_df['Neighborhood'],
                                 latitudes=Toronto_df['Latitude'],
                                 longitudes=Toronto_df['Longitude']
                                 )
Thank you in advance!
Thank you for your response,
Toronto_venues = getNearbyVenues(names=Toronto_df['Neighborhood'],
                                 latitudes=Toronto_df['Latitude'],
                                 longitudes=Toronto_df['Longitude']
                                 )
But this cell gives back this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-16-03f6027f84a2> in <module>()
1 Toronto_venues = getNearbyVenues(names=Toronto_df['Neighborhood'],
2 latitudes=Toronto_df['Latitude'],
----> 3 longitudes=Toronto_df['Longitude']
4 )
<ipython-input-13-0c3ca691c166> in getNearbyVenues(names, latitudes, longitudes, radius)
16
17 # make the GET request
---> 18 results = requests.get(url).json()["response"]['groups'][0]['items']
19
20 # return only relevant information for each nearby venue
KeyError: 'groups'
You need to do the value counts, then take any one of the count columns and rename it.
df = Toronto_venues.groupby('Neighborhood').count() # Get the counts
df = pd.DataFrame(df['Venue']) # Convert the counts to a dataframe
df.rename(columns={'Venue': 'No. of Hospitals'}, inplace=True)
At this point you will have a dataframe, but the first column, which holds your neighborhood names, is the index. If you want to pull it out into a column, then use this code as well:
df.reset_index(level=0, inplace=True)
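If it helps, the same result can be written as one chained expression (a sketch equivalent to the steps above):

df = (Toronto_venues.groupby('Neighborhood')['Venue']
      .count()
      .reset_index(name='No. of Hospitals'))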

Calling a request from Foursquare

I am trying to use the function below to retrieve venues for different locations, but I keep getting this error and I can't figure it out, because I used the same function before with different locations and it worked perfectly. Please help!
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood',
                             'Neighbourhood Latitude',
                             'Neighbourhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return(nearby_venues)
london_venues = getNearbyVenues(names=df['Location'],
                                latitudes=df['Latitude'],
                                longitudes=df['Longitude']
                                )
This is the error I am getting:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-180-4f13fb178c94> in <module>
1 london_venues = getNearbyVenues(names=df['Location'],
2 latitudes=df['Latitude'],
----> 3 longitudes=df['Longitude']
4 )
<ipython-input-177-d194f1c67c83> in getNearbyVenues(names, latitudes, longitudes, radius)
16
17 # make the GET request
---> 18 results = requests.get(url).json()["response"]['groups'][0]['items']
19
20 # return only relevant information for each nearby venue
KeyError: 'groups'
You might have exceeded your API call limit if you are using a sandbox account, or there is simply no key named "groups" in the response. If not, then please provide the coordinates of the location.
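One way to tell which of those it is: look at the meta block of the reply instead of indexing straight into 'groups'. A sketch to drop inside the loop of getNearbyVenues, right after url is built (the exact fields printed are whatever Foursquare returns, so treat them as indicative):

reply = requests.get(url).json()
print(reply.get('meta'))            # shows the status code / errorType when something went wrong (e.g. a quota error)
if 'groups' in reply.get('response', {}):
    results = reply['response']['groups'][0]['items']
else:
    results = []                    # skip this location instead of raising KeyError: 'groups'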

How can I loop through a dataframe to create separate Folium maps?

I'm trying to examine the sushi venues within 5 different cities, using Foursquare.
I can get the data and filter it correctly. Code below.
city = {'City':['Brunswick','Auckland','Wellington','Christchurch','Hamilton','Ponsonby'],
        'Latitude':[-37.7670,-36.848461,-41.28664,-43.55533,-37.78333,-36.8488],
        'Longitude':[144.9621,174.763336,174.77557,172.63333,175.28333,174.7381]}
df_location = pd.DataFrame(city, columns = ['City','Latitude','Longitude'])
def getNearbyVenues(names, latitudes, longitudes, radius=2000, LIMIT=100):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT,
            "4bf58dd8d48988d1d2941735")
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [
        'City',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',]
    return(nearby_venues)
sushi_venues = getNearbyVenues(names = df_location['City'],
                               latitudes = df_location['Latitude'],
                               longitudes = df_location['Longitude'])
cities = df_location["City"]
latitude = df_location["Latitude"]
longitude = df_location["Longitude"]
I'm getting stuck on creating the maps and I'm not sure how I should iterate through the cities to create a map for each.
Here's the code I have.
maps = {}
for city in cities:
    maps[city] = folium.Map(location = [latitude, longitude], zoom_start=10)
    for lat, lng, neighborhood in zip(sushi_venues['Venue Latitude'], sushi_venues['Venue Longitude'], sushi_venues['Venue']):
        label = '{}'.format(neighborhood)
        label = folium.Popup(label, parse_html = True)
        folium.CircleMarker(
            [lat, lng],
            radius = 5,
            popup = label,
            color = 'blue',
            fill = True,
            fill_color = '#3186cc',
            fill_opacity = 0.7,
            parse_html = False).add_to(maps[city])
maps[cities[0]]
For this code, 'maps[cities[0]]' brings up a blank Folium map.
If I change the code to reference the row of the city in df_location, e.g.
maps = {}
for city in cities:
    maps[city] = folium.Map(location = [latitude[0], longitude[0]], zoom_start=10)
Then 'maps[cities[0]]' brings up a correctly labeled Folium map of Brunswick with the corresponding venues marked.
So my question is: how can I correctly iterate through all 5 cities, so that I can pull up a new map for each without changing the location each time? I'm unable to zip the locations because a single lat/long pair is needed to initialize each Folium map.
Thanks so much for your help!
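A sketch of one way to do it, building on the version that works with latitude[0]/longitude[0]: zip the city names with their own scalar coordinates so each map is centred correctly, and (an added assumption) filter the markers by the 'City' column that the function above returns, so each map only shows that city's venues:

maps = {}
for city, lat0, lng0 in zip(cities, latitude, longitude):
    # each map gets that city's own scalar coordinates
    maps[city] = folium.Map(location=[lat0, lng0], zoom_start=10)
    city_venues = sushi_venues[sushi_venues['City'] == city]
    for lat, lng, venue in zip(city_venues['Venue Latitude'],
                               city_venues['Venue Longitude'],
                               city_venues['Venue']):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(venue, parse_html=True),
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(maps[city])

maps[cities[0]]   # display any one of the maps in a notebook cell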

I use to_gbq in pandas to update Google BigQuery and get a GenericGBQException

While trying to use to_gbq to update a Google BigQuery table, I get a response of:
GenericGBQException: Reason: 400 Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1.
My code:
gbq.to_gbq(mini_df,'Name-of-Table','Project-id',chunksize=10000,reauth=False,if_exists='append',private_key=None)
and my dataframe mini_df looks like:
date        request_number  name  feature_name  value_name  value
2018-01-10  1               1     "a"           "b"         0.309457
2018-01-10  1               1     "c"           "d"         0.273748
While to_gbq is running and there is no table yet in BigQuery, I can see that the table is created with the following schema:
date STRING NULLABLE
request_number STRING NULLABLE
name STRING NULLABLE
feature_name STRING NULLABLE
value_name STRING NULLABLE
value FLOAT NULLABLE
What am I doing wrong? How can I solve this?
P.S. Here is the rest of the exception:
BadRequest Traceback (most recent call last)
~/anaconda3/envs/env/lib/python3.6/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize)
589 destination_table,
--> 590 job_config=job_config).result()
591 except self.http_error as ex:
~/anaconda3/envs/env/lib/python3.6/site-packages/google/cloud/bigquery/job.py in result(self, timeout)
527 # TODO: modify PollingFuture so it can pass a retry argument to done().
--> 528 return super(_AsyncJob, self).result(timeout=timeout)
529
~/anaconda3/envs/env/lib/python3.6/site-packages/google/api_core/future/polling.py in result(self, timeout)
110 # Pylint doesn't recognize that this is valid in this case.
--> 111 raise self._exception
112
BadRequest: 400 Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1.
During handling of the above exception, another exception occurred:
GenericGBQException Traceback (most recent call last)
<ipython-input-28-195df93249b6> in <module>()
----> 1 gbq.to_gbq(mini_df,'Name-of-Table','Project-id',chunksize=10000,reauth=False,if_exists='append',private_key=None)
~/anaconda3/envs/env/lib/python3.6/site-packages/pandas/io/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, verbose, reauth, if_exists, private_key)
106 chunksize=chunksize,
107 verbose=verbose, reauth=reauth,
--> 108 if_exists=if_exists, private_key=private_key)
~/anaconda3/envs/env/lib/python3.6/site-packages/pandas_gbq/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, verbose, reauth, if_exists, private_key, auth_local_webserver)
987 table.create(table_id, table_schema)
988
--> 989 connector.load_data(dataframe, dataset_id, table_id, chunksize)
990
991
~/anaconda3/envs/env/lib/python3.6/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize)
590 job_config=job_config).result()
591 except self.http_error as ex:
--> 592 self.process_http_error(ex)
593
594 rows = []
~/anaconda3/envs/env/lib/python3.6/site-packages/pandas_gbq/gbq.py in process_http_error(ex)
454 # <https://cloud.google.com/bigquery/troubleshooting-errors>`__
455
--> 456 raise GenericGBQException("Reason: {0}".format(ex))
457
458 def run_query(self, query, **kwargs):
GenericGBQException: Reason: 400 Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1.
I've had the very same problem.
In my case it came down to the object data type of the DataFrame columns.
I had three columns: externalId, mappingId, info. I did not set a data type for any of them and let pandas do its magic.
It decided to set all three columns to dtype object. The problem is that internally to_gbq uses to_json, and for some reason that output omits the quotes around a field if its dtype is object but it holds only numerical values.
So Google BigQuery needed this:
{"externalId": "12345", "mappingId":"abc123", "info":"blerb"}
but got this:
{"externalId": 12345, "mappingId":"abc123", "info":"blerb"}
And because the field was mapped as STRING in Google BigQuery, the import process failed.
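A tiny reproduction of that behaviour (a sketch; the column names just mirror the ones above):

import pandas as pd

df = pd.DataFrame({'externalId': [12345], 'mappingId': ['abc123'], 'info': ['blerb']})
df['externalId'] = df['externalId'].astype(object)   # dtype is object, but the value is still an int

print(df.to_json(orient='records'))
# [{"externalId":12345,"mappingId":"abc123","info":"blerb"}]  <- no quotes around 12345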
Two solutions came up.
Solution 1 - Change the data type of the column
A simple type conversion helped with this issue. I also had to change the data type in BigQuery to INTEGER.
df['externalId'] = df['externalId'].astype('int')
In this case, BigQuery can consume fields without quotes, as the JSON standard allows.
Solution 2 - Make sure the string field is a string
Again, this is about setting the data type. Since we set it explicitly to string, to_json writes out a quoted field and everything works fine.
df['externalId'] = df['externalId'].astype('str')
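Applied to the mini_df from the question, Solution 2 might look like this; which columns actually need the cast is an assumption based on the STRING schema shown above:

# Cast the numeric-looking columns explicitly so to_json writes quoted values
# that match the STRING columns BigQuery created
for col in ['date', 'request_number', 'name', 'feature_name', 'value_name']:
    mini_df[col] = mini_df[col].astype(str)

gbq.to_gbq(mini_df, 'Name-of-Table', 'Project-id',
           chunksize=10000, if_exists='append', private_key=None)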
