Memory error when retrieving data from Songkick - Python

I have built a scraper to retrieve concert data from Songkick using their API. However, it takes a long time to retrieve all the data for these artists. After scraping for approximately 15 hours the script was still running, but the JSON file no longer changed. I interrupted the script and checked whether I could access my data with TinyDB. Unfortunately I get the following error. Does anybody know why this is happening?
Error:
('cannot fetch url', 'http://api.songkick.com/api/3.0/artists/8689004/gigography.json?apikey=###########&min_date=2015-04-25&max_date=2017-03-01')
8961344
Traceback (most recent call last):
File "C:\Users\rmlj\Dropbox\Data\concerts.py", line 42, in <module>
load_events()
File "C:\Users\rmlj\Dropbox\Data\concerts.py", line 27, in load_events
print(artist)
File "C:\Python27\lib\idlelib\PyShell.py", line 1356, in write
return self.shell.write(s, self.tags)
KeyboardInterrupt
>>> mydat = db.all()
Traceback (most recent call last):
File "<pyshell#0>", line 1, in <module>
mydat = db.all()
File "C:\Python27\lib\site-packages\tinydb\database.py", line 304, in all
return list(itervalues(self._read()))
File "C:\Python27\lib\site-packages\tinydb\database.py", line 277, in _read
return self._storage.read()
File "C:\Python27\lib\site-packages\tinydb\database.py", line 31, in read
raw_data = (self._storage.read() or {})[self._table_name]
File "C:\Python27\lib\site-packages\tinydb\storages.py", line 105, in read
return json.load(self._handle)
File "C:\Python27\lib\json\__init__.py", line 287, in load
return loads(fp.read(),
MemoryError
Below you can find my script:
import urllib2
import requests
import json
import csv
import codecs
from tinydb import TinyDB, Query

db = TinyDB('events.json')

def load_events():
    MIN_DATE = "2015-04-25"
    MAX_DATE = "2017-03-01"
    API_KEY = "###############"

    with open('artistid.txt', 'r') as f:
        for a in f:
            artist = a.strip()
            print(artist)

            url_base = 'http://api.songkick.com/api/3.0/artists/{}/gigography.json?apikey={}&min_date={}&max_date={}'
            url = url_base.format(artist, API_KEY, MIN_DATE, MAX_DATE)
            # url = u'http://api.songkick.com/api/3.0/search/artists.json?query='+artist+'&apikey=WBmvXDarTCEfqq7h'

            try:
                r = requests.get(url)
                resp = r.json()
                if resp['resultsPage']['totalEntries']:
                    results = resp['resultsPage']['results']['event']
                    for x in results:
                        print(x)
                        db.insert(x)
            except:
                print('cannot fetch url', url)

load_events()
db.close()
print("End of script")

MemoryError is a built-in Python exception (https://docs.python.org/3.6/library/exceptions.html#MemoryError), so it looks like the process is out of memory and this isn't really related to Songkick.
This question probably has the information you need to debug this: How to debug a MemoryError in Python? Tools for tracking memory use?
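TinyDB's default JSON storage keeps the whole database as a single JSON document, and db.all() has to json.load it in one go, so after 15 hours of inserts the file is simply too big for that. If the main goal is to dump events to disk and read them back later, one way around it is newline-delimited JSON: append one record per line and read the file back lazily. A minimal sketch, not part of the original script (the events.ndjson file name is made up):

import json

def save_event(event, path='events.ndjson'):
    # Append one event per line instead of rewriting a single huge JSON document.
    with open(path, 'a') as out:
        out.write(json.dumps(event) + '\n')

def iter_events(path='events.ndjson'):
    # Yield events one at a time so the whole file never sits in memory at once.
    with open(path) as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

Whether that fits depends on whether you still need TinyDB's query features; for a plain archive of API responses it avoids the single giant json.load.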

Related

NameError: name 'find_stack_level' is not defined when trying to get an xlsx file from a website

I was trying to import an xlsx file from a website using the requests package and it returned a strange error. The code and the error are below.
import numpy as np
import matplotlib as plt
import pandas as pd
from io import BytesIO
import requests as rq
url = "http://pdet.mte.gov.br/images/Novo_CAGED/Jan2022/3-tabelas.xlsx"
data = rq.get(url).content
caged = pd.read_excel(BytesIO(data))
Traceback (most recent call last):
File "D:\Perfil\Desktop\UFV\trabalho_econometria.py", line 9, in <module>
caged = pd.read_excel(BytesIO(data))
File "C:\Users\Windows\anaconda3\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\Windows\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 344, in read_excel
data = io.parse(
File "C:\Users\Windows\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1170, in parse
return self._reader.parse(
File "C:\Users\Windows\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 504, in parse
if header is not None and is_list_like(header):
NameError: name 'find_stack_level' is not defined
Was trying to read an xlsx sheet from a website and got a strange error.
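This NameError usually points at a broken or partially upgraded pandas installation (find_stack_level is a pandas-internal helper), so reinstalling pandas is the first thing to try. As a hypothetical workaround while pandas is in that state, the workbook can be read with openpyxl directly, skipping pandas' Excel reader entirely:

from io import BytesIO

import requests as rq
from openpyxl import load_workbook  # assumes openpyxl is installed

url = "http://pdet.mte.gov.br/images/Novo_CAGED/Jan2022/3-tabelas.xlsx"
data = rq.get(url).content

wb = load_workbook(BytesIO(data), read_only=True)
ws = wb.active  # first sheet
rows = list(ws.iter_rows(values_only=True))
print(rows[:5])  # first few rows as plain tuples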

Traceback error when using search_engine_parser

I am writing a simple script to search Google using search_engine_parser; it was working fine until yesterday.
All search queries are stored in a test.csv file.
from search_engine_parser.core.engines.google import Search as GoogleSearch
import csv

with open('/Users/John/Desktop/test.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    header = next(csv_reader)
    # Check file as empty
    if header != None:
        for row in csv_reader:
            gsearch = GoogleSearch()
            gresults = gsearch.search(row)
            print(gresults["titles"][0])
I am getting the error below:
"/Users/John/Documents/python scripts/venv/bin/python" "/Users/John/Documents/python scripts/search_parser2.py"
Samsung Galaxy J7 - Full phone specifications - GSMArena.com
Search for samsung sm-n920c - GSMArena.com
Traceback (most recent call last):
File "/Users/John/Documents/python scripts/venv/lib/python3.7/site-packages/search_engine_parser/core/base.py", line 240, in get_results
search_results = self.parse_result(results, **kwargs)
File "/Users/John/Documents/python scripts/venv/lib/python3.7/site-packages/search_engine_parser/core/base.py", line 151, in parse_result
rdict = self.parse_single_result(each, **kwargs)
File "/Users/John/Documents/python scripts/venv/lib/python3.7/site-packages/search_engine_parser/core/engines/google.py", line 74, in parse_single_result
title = r_elem.find('div', class_='BNeawe').text
AttributeError: 'NoneType' object has no attribute 'text'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/John/Documents/python scripts/search_parser2.py", line 10, in <module>
gresults = gsearch.search(row)
File "/Users/John/Documents/python scripts/venv/lib/python3.7/site-packages/search_engine_parser/core/base.py", line 270, in search
return self.get_results(soup, **kwargs)
File "/Users/John/Documents/python scripts/venv/lib/python3.7/site-packages/search_engine_parser/core/base.py", line 244, in get_results
"The returned results could not be parsed. This might be due to site updates or "
search_engine_parser.core.exceptions.NoResultsOrTrafficError: The returned results could not be parsed. This might be due to site updates or server errors. Drop an issue at https://github.com/bisoncorps/search-engine-parser if this persists
Process finished with exit code 1
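The library itself is reporting that Google's result markup could not be parsed, so an update to search_engine_parser (or waiting out a traffic block) may be the real fix. Independently of that, csv.reader yields a list per row, so gsearch.search(row) receives a list rather than a query string. A sketch that passes a plain string and skips queries the parser chokes on (the column index 0 is an assumption about the csv layout):

import csv

from search_engine_parser.core.engines.google import Search as GoogleSearch
from search_engine_parser.core.exceptions import NoResultsOrTrafficError

gsearch = GoogleSearch()
with open('/Users/John/Desktop/test.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    for row in csv_reader:
        query = row[0]  # assumed: the query sits in the first column
        try:
            gresults = gsearch.search(query)
            print(gresults["titles"][0])
        except NoResultsOrTrafficError as exc:
            print("skipping %r: %s" % (query, exc))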

unable to load json data

I am unable to load the JSON data and I am getting the errors mentioned below.
My code is:
import requests
import json
url = 'https://172.28.1.220//actifio/api/info/lsjobhistory?sessionid=cafc8f31-fb39-4020-8172-e8f0085004fd'
ret=requests.get(url,verify=False)
data=json.load(ret)
print(data)
I get the following error:
Traceback (most recent call last):
File "pr.py", line 7, in <module>
data=json.load(ret)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py", line 293, in load
return loads(fp.read(),
AttributeError: 'Response' object has no attribute 'read'
You don't actually need to import json. Try this:
import requests
url = 'https://172.28.1.220//actifio/api/info/lsjobhistory?sessionid=cafc8f31-fb39-4020-8172-e8f0085004fd'
ret = requests.get(url,verify=False)
data = ret.json()
print(data)
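If you do want to go through the json module explicitly, hand it the response body rather than the Response object, since json.load expects something with a .read() method:

import json
import requests

url = 'https://172.28.1.220//actifio/api/info/lsjobhistory?sessionid=cafc8f31-fb39-4020-8172-e8f0085004fd'
ret = requests.get(url, verify=False)
data = json.loads(ret.text)  # parse the body text; equivalent to ret.json()
print(data)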

How do I use pydap library to collect THREDDS data?

I have been trying to use the example get_nomads.py module from Will Holgren which he was nice enough to forward my way. In the code, there is a call to get the THREDDS data as follows:
from pydap.client import open_url
dataset = open_url('https://nomads.ncdc.noaa.gov/thredds/dodsC/gfs-004/201612/20161201/gfs_4_20161201_0000_003.grb2')
This does not work, apparently because the old THREDDS server has been decommissioned.
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm 2018.2.4\helpers\pydev\_pydevd_bundle\pydevd_exec2.py", line 3, in Exec
exec(exp, global_vars, local_vars)
File "<input>", line 1, in <module>
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\client.py", line 64, in open_url
dataset = DAPHandler(url, application, session, output_grid).dataset
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\handlers\dap.py", line 51, in __init__
raise_for_status(r)
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\net.py", line 30, in raise_for_status
comment=response.body
webob.exc.HTTPError: 404 Not Found
So looking around I am not able to find a THREDDS server that supports this method of data access.
BTW: I am able to get data as follows:
url = 'http://dtvirt5.deltares.nl:8080/thredds/dodsC/opendap/rijkswaterstaat/jarkus/profiles/transect.nc'
dataset = open_url(url)
<DatasetType with children 'id', 'areacode', 'areaname', 'alongshore', 'cross_shore', 'time', 'time_bounds', 'epsg', 'x', 'y', 'lat', 'lon', 'angle', 'mean_high_water', 'mean_low_water', 'max_cross_shore_measurement', 'min_cross_shore_measurement', 'nsources', 'max_altitude_measurement', 'min_altitude_measurement', 'rsp_x', 'rsp_y', 'rsp_lat', 'rsp_lon', 'time_topo', 'time_bathy', 'origin', 'altitude'>
variable = dataset['id']
print(variable[0:10])
[2000100 2000101 2000102 2000103 2000104 2000105 2000106 2000120 2000140
2000160]
I also see that I can manually download the data from https://www.ncei.noaa.gov/thredds/dodsC/gfs-g4-anl-files/201808/20180828/gfsanl_4_20180828_1800_006.grb2.html
But I can't seem to find the argument format to download the data using pydap. I think all I need is a pointer to a real THREDDS server that has the appropriate DDS and DAS files at the same URI location.
Does anyone know how to get the GFS4 GRB files using the pydap client?
Thanks
I haven't tested this with pydap, only with netCDF4, which works very well with THREDDS. This should also work with pydap.
dataset = open_url('http://www.ncei.noaa.gov/thredds/dodsC/gfs-g4-anl-files/201612/20161201/gfsanl_4_20161201_0000_003.grb2')
The THREDDS OPeNDAP form for that one file is here:
The main catalog organized by YYYYMM/ is at:
https://www.ncei.noaa.gov/thredds/catalog/gfs-g4-anl-files/catalog.html
All NCEI GFS datasets with links to TDS access can be seen here:
https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/global-forcast-system-gfs
When I try the link you provided, Eric, with pydap I get this error.
dataset = open_url('http://www.ncei.noaa.gov/thredds/dodsC/gfs-g4-anl-files/201612/20161201/gfsanl_4_20161201_0000_003.grb2')
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm 2018.2.4\helpers\pydev\_pydevd_bundle\pydevd_exec2.py", line 3, in Exec
exec(exp, global_vars, local_vars)
File "<input>", line 1, in <module>
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\client.py", line 64, in open_url
dataset = DAPHandler(url, application, session, output_grid).dataset
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\handlers\dap.py", line 64, in __init__
self.dataset = build_dataset(dds)
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\parsers\dds.py", line 161, in build_dataset
return DDSParser(dds).parse()
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\parsers\dds.py", line 49, in parse
self.consume('dataset')
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\parsers\dds.py", line 41, in consume
token = super(DDSParser, self).consume(regexp)
File "C:\Users\pmoran\jira\slf\venv\lib\site-packages\pydap\parsers\__init__.py", line 182, in consume
raise Exception("Unable to parse token: %s" % self.buffer[:10])
Exception: Unable to parse token: <!DOCTYPE
However, per your suggestion I'm able to get data with netCDF4.
Here's what I did.
>>> import netCDF4
>>> nc = netCDF4.Dataset('http://www.ncei.noaa.gov/thredds/dodsC/gfs-g4-anl-files/201612/20161201/gfsanl_4_20161201_0000_003.grb2')
>>> nc.variables.keys()
odict_keys(['LatLon_Projection', 'lat', 'lon', 'reftime', 'time', 'time_bounds',
...
'v-component_of_wind_altitude_above_msl', 'v-component_of_wind_height_above_ground', 'v-component_of_wind_tropopause', 'v-component_of_wind_sigma'])
That seems to work. Not sure what's wrong with pydap.
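For completeness, a small sketch of pulling actual values through the same netCDF4 handle; lat and lon are used here only because they appear in the key listing above, and OPeNDAP transfers just the slices you ask for:

import netCDF4

nc = netCDF4.Dataset(
    'http://www.ncei.noaa.gov/thredds/dodsC/gfs-g4-anl-files/'
    '201612/20161201/gfsanl_4_20161201_0000_003.grb2')
lats = nc.variables['lat'][:]  # coordinate axes are small, so read them whole
lons = nc.variables['lon'][:]
print(lats.shape, lons.shape)
nc.close()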

Python trouble with JSON objects schema to parse iTunes id lookup

I am trying to parse information about applications from the iTunes lookup tool, for example https://itunes.apple.com/lookup?id=880047117.
Right now I am trying to open up a connection using requests, json and a jsonobjects schema. However, the schema keeps failing with this error:
Traceback (most recent call last):
File "parse_and_query.py", line 33, in <module>
details = get_app_details(880047117)
File "/usr/lib/python3.5/site-packages/jsonobjects/schema.py", line 87, in wrapper
return self.parse(func(*args, **kwargs))
File "/usr/lib/python3.5/site-packages/jsonobjects/fields.py", line 169, in parse
return self.run_validation(value)
File "/usr/lib/python3.5/site-packages/jsonobjects/fields.py", line 136, in run_validation
is_empty, value = self.validate_empty_values(value)
File "/usr/lib/python3.5/site-packages/jsonobjects/fields.py", line 105, in validate_empty_values
self.fail('required')
File "/usr/lib/python3.5/site-packages/jsonobjects/fields.py", line 165, in fail
raise ValidationError(msg.format(**kwargs), self.field_name)
jsonobjects.exceptions.ValidationError: ['This field is required.']
The schema is declared as its own class and instantiated, but it continually fails. I have the IDs of the apps whose JSON information I would like to look up. If there is an easier way that I am missing, please let me know; I don't have access to the iTunes API.
#!/usr/local/bin/python3.5
import json
import requests
import jsonobjects as jo
from jsonschema import Draft4Validator

class iTunesAppSchema(jo.Schema):
    id = jo.IntegerField('trackId')
    url = jo.Field('trackViewUrl')
    name = jo.StringField('trackName')
    rating = jo.FloatField('averageUserRating')
    reviews = jo.IntegerField('userRatingCountForCurrentVersion')
    version = jo.StringField('version')
    bundle_id = jo.StringField('bundleId')
    publisher_id = jo.IntegerField('artistId')
    publisher_url = jo.Field('artistViewUrl')
    publisher_name = jo.StringField('artistName')
    categories = jo.ListField('genres', child=jo.StringField())

parser = iTunesAppSchema('results[0]')

@parser.as_decorator
def get_app_details(app_id):
    url = 'https://itunes.apple.com/lookup?id={}'
    return requests.get(url.format(app_id)).json()
# https://itunes.apple.com/lookup?id=880047117
details = get_app_details(880047117)
print(details)
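If the schema keeps rejecting the payload, it can help to look at the raw lookup JSON first, or to sidestep jsonobjects entirely and read the fields straight from the parsed dictionary. A sketch using the same field names the schema maps, assuming the lookup returns at least one result:

import requests

def get_app_details_plain(app_id):
    # Plain-dict version of the lookup, no schema library involved.
    url = 'https://itunes.apple.com/lookup?id={}'.format(app_id)
    payload = requests.get(url).json()
    if not payload.get('resultCount'):
        return None  # no app found for this id
    app = payload['results'][0]
    return {
        'id': app.get('trackId'),
        'name': app.get('trackName'),
        'version': app.get('version'),
        'bundle_id': app.get('bundleId'),
        'publisher_name': app.get('artistName'),
        'categories': app.get('genres', []),
    }

print(get_app_details_plain(880047117))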
