BulkIndexError while using elasticsearch with Python

I am using the following code to load some data into Elasticsearch in a Python module:
def loadNewsDataToElasticsearch():
    if not es:
        print('Elasticsearch server not accessible.')
    # Delete all previous data
    print('Clearing old data from Elasticsearch server.')
    indices = ['articles']
    if es.indices.exists(index='articles'):
        es.delete_by_query(index=indices, body={"query": {"match_all": {}}})
    print('Loading new data from csv file to dataframe.')
    df = pd.read_csv(directory + "elasticsearch/dataset/newsDataSmall.csv", index_col=0)
    dataset = {'title': df['title'], 'link': df['link'], 'date': df.index}
    # Elasticsearch bulk buffer
    buffer = []
    rows = 0
    # Testing
    # print(dataset['title'][0])
    print('Inserting new data to Elasticsearch.')
    for title, link, date in zip(dataset['title'], dataset['link'], dataset['date']):
        # Article record
        article = {"_id": rows, "_index": "articles", "title": title, "link": link, "date": date}
        # Buffer article
        buffer.append(article)
        # Increment number of articles processed
        rows += 1
        # Bulk load every 1000 records
        if rows % 1000 == 0:
            helpers.bulk(es, buffer)
            buffer = []
            print("Inserted {} articles".format(rows), end="\r")
    if buffer:
        helpers.bulk(es, buffer)
    print("Total articles inserted: {}".format(rows))
    # return(es)
The above code runs perfectly fine when I use it on Windows in Anaconda. However, when I run the same code on a Linux server, I get the following error:
loadNewsDataToElasticsearch()
Clearing old data from Elasticsearch server.
Loading new data from csv file to dataframe.
Inserting new data to Elasticsearch.
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/priyanka/final5-refined-pyc3.9.15/nlpfnews/semanticSearch.py", line 145, in loadNewsDataToElasticsearch
    helpers.bulk(es, buffer)
  File "/home/priyanka/miniconda3/lib/python3.9/site-packages/elasticsearch/helpers/actions.py", line 524, in bulk
    for ok, item in streaming_bulk(
  File "/home/priyanka/miniconda3/lib/python3.9/site-packages/elasticsearch/helpers/actions.py", line 438, in streaming_bulk
    for data, (ok, info) in zip(
  File "/home/priyanka/miniconda3/lib/python3.9/site-packages/elasticsearch/helpers/actions.py", line 355, in _process_bulk_chunk
    yield from gen
  File "/home/priyanka/miniconda3/lib/python3.9/site-packages/elasticsearch/helpers/actions.py", line 274, in _process_bulk_chunk_success
    raise BulkIndexError(f"{len(errors)} document(s) failed to index.", errors)
elasticsearch.helpers.BulkIndexError: 500 document(s) failed to index.
I am stuck with this for the past few days and have not been able to resolve it. Any help or suggestions will be appreciated.
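No answer is recorded here, but since the message only says how many documents failed, a first diagnostic step is to catch the BulkIndexError and inspect its errors attribute, which carries the per-document failure reasons returned by Elasticsearch. A minimal sketch, assuming the same es client and buffer as in the question:

from elasticsearch import helpers
from elasticsearch.helpers import BulkIndexError

try:
    helpers.bulk(es, buffer)
except BulkIndexError as e:
    # e.errors holds one dict per rejected document; the first few usually
    # reveal the root cause (e.g. a mapping conflict, or a date value that
    # the index on the Linux server cannot parse).
    for err in e.errors[:5]:
        print(err)
    raise

Whatever actually differs between the Windows and Linux runs (index mapping, CSV copy or encoding, client version), the per-document errors should point at it.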

Related

Pymongo ignoring allowDiskUse = True

I've looked at the other answers to this question, and yet it is still not working. I am trying to delete duplicate cases; here is the function:
def deleteDups(datab):
    col = db[datab]
    pipeline = [
        {'$group': {
            '_id': {
                'CASE NUMBER': '$CASE NUMBER',
                'JURISDICTION': '$JURISDICTION'},  # needs to be case insensitive
            'count': {'$sum': 1},
            'ids': {'$push': '$_id'}
        }
        },
        {'$match': {'count': {'$gt': 1}}},
    ]
    results = col.aggregate(pipeline, allowDiskUse=True)
    count = 0
    for result in results:
        doc_count = 0
        print(result)
        it = iter(result['ids'])
        next(it)
        for id in it:
            deleted = col.delete_one({'_id': id})
            count += 1
            doc_count += 1
            # print("API call received:", deleted.acknowledged)  # debug: is the database receiving requests?
    print("Total documents deleted:", count)
And yet, every time, I get this traceback:
File "C:\Users\*****\Documents\GitHub\*****\controller.py", line 202, in deleteDups
results = col.aggregate(pipeline, allowDiskUse = True)
File "C:\Python38\lib\site-packages\pymongo\collection.py", line 2375, in aggregate
return self._aggregate(_CollectionAggregationCommand,
File "C:\Python38\lib\site-packages\pymongo\collection.py", line 2297, in _aggregate
return self.__database.client._retryable_read(
File "C:\Python38\lib\site-packages\pymongo\mongo_client.py", line 1464, in _retryable_read
return func(session, server, sock_info, slave_ok)
File "C:\Python38\lib\site-packages\pymongo\aggregation.py", line 136, in get_cursor
result = sock_info.command(
File "C:\Python38\lib\site-packages\pymongo\pool.py", line 603, in command
return command(self.sock, dbname, spec, slave_ok,
File "C:\Python38\lib\site-packages\pymongo\network.py", line 165, in command
helpers._check_command_response(
File "C:\Python38\lib\site-packages\pymongo\helpers.py", line 159, in _check_command_response
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Exceeded memory limit for $group, but didn't allow external sort. Pass allowDiskUse:true to opt in.
I asterisked out bits of the paths to protect privacy. But it is driving me absolutely nuts that the line results = col.aggregate(pipeline, allowDiskUse = True) very explicitly passes allowDiskUse = True, and Mongo is just ignoring it. If I misspelled something, I'm blind; True has to be capitalized to pass a bool in Python.
I feel like I'm going crazy here.
According to the documentation:
Atlas Free Tier and shared clusters do not support the allowDiskUse option for the aggregation command or its helper method.
(Thanks to Shane Harvey for this info)
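If moving off the shared tier is not an option, one workaround (a sketch, not from the original answer) is to shrink what the $group stage must hold in memory by projecting only the grouping keys first, which may keep the stage under MongoDB's 100 MB per-stage limit:

pipeline = [
    # Keep only the two grouping keys (plus _id, which $project retains by
    # default) so each document entering $group is as small as possible.
    {'$project': {'CASE NUMBER': 1, 'JURISDICTION': 1}},
    {'$group': {
        '_id': {
            'CASE NUMBER': '$CASE NUMBER',
            'JURISDICTION': '$JURISDICTION'},
        'count': {'$sum': 1},
        'ids': {'$push': '$_id'}
    }},
    {'$match': {'count': {'$gt': 1}}},
]
results = col.aggregate(pipeline)  # allowDiskUse omitted; the free tier does not support it

Whether this is enough depends on the collection size; past a point, the only options are a paid tier or deduplicating client-side in batches.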

Python code breaks when attempting to download larger zipped csv file, works fine on smaller file

While working with small zip files (about 8 MB) containing 25 MB of CSV files, the code below works exactly as it should. As soon as I attempt to download larger files (a 45 MB zip file containing a 180 MB csv), the code breaks and I get the following error message:
(venv) ufulu@ufulu awr % python get_awr_ranking_data.py
https://api.awrcloud.com/v2/get.php?action=get_topsites&token=REDACTED&project=REDACTED Client+%5Bw%5D&fileName=2017-01-04-2019-10-09
Traceback (most recent call last):
  File "get_awr_ranking_data.py", line 101, in <module>
    getRankingData(project['name'])
  File "get_awr_ranking_data.py", line 67, in getRankingData
    processRankingdata(rankDateData['details'])
  File "get_awr_ranking_data.py", line 79, in processRankingdata
    domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
AttributeError: 'float' object has no attribute 'split'
My goal is to download data for 170 projects and save the data to a sqlite DB.
Please bear with me as I am a novice in the field of programming and Python. I would greatly appreciate any help fixing the code below, as well as any other suggestions and improvements for making the code more robust and pythonic.
Thanks in advance
from dotenv import dotenv_values
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from sqlalchemy import create_engine

# SQL Alchemy setup
engine = create_engine('sqlite:///rankingdata.sqlite', echo=False)

# Excerpt from the initial API call
data = {'projects': [{'name': 'Client1',
                      'id': '168',
                      'frequency': 'daily',
                      'depth': '5',
                      'kwcount': '80',
                      'last_updated': '2019-10-01',
                      'keywordstamp': 1569941983},
                     {"depth": "5",
                      "frequency": "ondemand",
                      "id": "194",
                      "kwcount": "10",
                      "last_updated": "2019-09-30",
                      "name": "Client2",
                      "timestamp": 1570610327},
                     {"depth": "5",
                      "frequency": "ondemand",
                      "id": "196",
                      "kwcount": "100",
                      "last_updated": "2019-09-30",
                      "name": "Client3",
                      "timestamp": 1570610331}]}

# Setup
api_url = 'https://api.awrcloud.com/v2/get.php?action='
urls = []        # processed URLs
urlbacklog = []  # URLs that didn't return a downloadable file

# API call to receive URL containing downloadable zip and csv
def getRankingData(project):
    action = 'get_dates'
    response = requests.get(''.join([api_url, action]),
                            params=dict(token=dotenv_values()['AWR_API'],
                                        project=project))
    response = response.json()
    action2 = 'topsites_export'
    rankDateData = requests.get(''.join([api_url, action2]),
                                params=dict(token=dotenv_values()['AWR_API'],
                                            project=project,
                                            startDate=response['details']['dates'][0]['date'],
                                            stopDate=response['details']['dates'][-1]['date']))
    rankDateData = rankDateData.json()
    print(rankDateData['details'])
    urls.append(rankDateData['details'])
    processRankingdata(rankDateData['details'])

# API call to download and unzip csv data and process it in pandas
def processRankingdata(url):
    content = requests.get(url)
    # {"response_code":25,"message":"Export in progress. Please come back later"}
    if "response_code" not in content:
        f = ZipFile(BytesIO(content.content))
        # print(f.namelist())  # to get all filenames in the zip
        with f.open(f.namelist()[0], 'r') as g:
            rankingdatadf = pd.read_csv(g)
        rankingdatadf = rankingdatadf[rankingdatadf['Search Engine'].str.contains("Google")]
        domain = []
        for row in rankingdatadf['URL']:
            domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
        rankingdatadf['Domain'] = domain
        rankingdatadf['Domain'] = rankingdatadf['Domain'].str.replace('www.', '')
        rankingdatadf = rankingdatadf.drop(columns=['Title', 'Meta description', 'Snippet', 'Page'])
        print(rankingdatadf['Search Engine'][0])
        writeData(rankingdatadf)
    else:
        urlbacklog.append(url)

# Finally write the data to the database
def writeData(rankingdatadf):
    table_name_from_file = project['name']
    check = engine.has_table(table_name_from_file)
    print(check)  # boolean
    if check == False:
        rankingdatadf.to_sql(table_name_from_file, con=engine)
        print(project['name'] + ' ...Done')
    else:
        print(project['name'] + ' ... already in DB')

for project in data['projects']:
    getRankingData(project['name'])
The problem seems to be the split call on a float and not necessarily the download. Try changing line 79
from
domain.append(row.split("//")[-1].split("/")[0].split('?')[0])
to
domain.append(str(row).split("//")[-1].split("/")[0].split('?')[0])
It looks like you're trying to parse the network location portion of the URL here. You can also use urllib.parse to make this easier instead of chaining all the splits:
from urllib.parse import urlparse
...
for row in rankingdatadf['URL']:
    domain.append(urlparse(row).netloc)
I think a malformed URL is causing you issues. To diagnose, try:
for row in rankingdatadf['URL']:
    try:
        domain.append(urlparse(row).netloc)
    except Exception:
        exit(row)
Looks like you figured it out above: you have a database entry with a NULL value for the URL field. Not sure what your fidelity requirements for this data set are, but you might want to enforce database rules for the URL field, or use pandas to drop rows where URL is NaN.
rankingdatadf = rankingdatadf.dropna(subset=['URL'])
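Putting the two suggestions together, the domain-extraction step in processRankingdata might look like this (a sketch against the question's dataframe):

from urllib.parse import urlparse

# Drop rows with a missing URL before parsing, then keep only the host part.
rankingdatadf = rankingdatadf.dropna(subset=['URL'])
rankingdatadf['Domain'] = [urlparse(row).netloc for row in rankingdatadf['URL']]
# regex=False so the dot in 'www.' is matched literally, not as a regex wildcard
rankingdatadf['Domain'] = rankingdatadf['Domain'].str.replace('www.', '', regex=False)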

UnicodeDecodeError for md5 id bulk importing data into elasticsearch

I have written a simple Python script to import data into Elasticsearch using the bulk API.
# -*- encoding: utf-8 -*-
import csv
import datetime
import hashlib

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from dateutil.relativedelta import relativedelta

ORIGINAL_FORMAT = '%y-%m-%d %H:%M:%S'
INDEX_PREFIX = 'my-log'
INDEX_DATE_FORMAT = '%Y-%m-%d'
FILE_ADDR = '/media/zeinab/ZiZi/Elastic/python/elastic-test/elasticsearch-import-data/sample_data/sample.csv'

def set_data(input_file):
    with open(input_file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            sendtime = datetime.datetime.strptime(row['sendTime'].split('.')[0], ORIGINAL_FORMAT)
            yield {
                "_index": '{0}-{1}_{2}'.format(
                    INDEX_PREFIX,
                    sendtime.replace(day=1).strftime(INDEX_DATE_FORMAT),
                    (sendtime.replace(day=1) + relativedelta(months=1)).strftime(INDEX_DATE_FORMAT)),
                "_type": 'data',
                '_id': hashlib.md5("{0}{1}{2}{3}{4}".format(sendtime, row['IMSI'], row['MSISDN'], int(row['ruleRef']), int(row['sponsorRef']))).digest(),
                "_source": {
                    'body': {
                        'status': int(row['status']),
                        'sendTime': sendtime
                    }
                }
            }

if __name__ == "__main__":
    es = Elasticsearch(['http://{0}:{1}'.format('my.host.ip.addr', 9200)])
    es.indices.delete(index='*')
    success, _ = bulk(es, set_data(FILE_ADDR))
This comment helped me with writing/using the set_data method.
Unfortunately I get this exception:
/usr/bin/python2.7 /media/zeinab/ZiZi/Elastic/python/elastic-test/elasticsearch-import-data/import_bulk_data.py
Traceback (most recent call last):
  File "/media/zeinab/ZiZi/Elastic/python/elastic-test/elasticsearch-import-data/import_bulk_data.py", line 59, in <module>
    success, _ = bulk(es, set_data(source_file))
  File "/usr/local/lib/python2.7/dist-packages/elasticsearch/helpers/__init__.py", line 257, in bulk
    for ok, item in streaming_bulk(client, actions, **kwargs):
  File "/usr/local/lib/python2.7/dist-packages/elasticsearch/helpers/__init__.py", line 180, in streaming_bulk
    client.transport.serializer):
  File "/usr/local/lib/python2.7/dist-packages/elasticsearch/helpers/__init__.py", line 60, in _chunk_actions
    action = serializer.dumps(action)
  File "/usr/local/lib/python2.7/dist-packages/elasticsearch/serializer.py", line 50, in dumps
    raise SerializationError(data, e)
elasticsearch.exceptions.SerializationError: ({u'index': {u'_type': 'data', u'_id': '8\x1dI\xa2\xe9\xa2H-\xa6\x0f\xbd=\xa7CY\xa3', u'_index': 'my-log-2017-04-01_2017-05-01'}}, UnicodeDecodeError('utf8', '8\x1dI\xa2\xe9\xa2H-\xa6\x0f\xbd=\xa7CY\xa3', 3, 4, 'invalid start byte'))
Process finished with exit code 1
I can insert this data into Elasticsearch successfully using the index API:
es.index(index='{0}-{1}_{2}'.format(
             INDEX_PREFIX,
             sendtime.replace(day=1).strftime(INDEX_DATE_FORMAT),
             (sendtime.replace(day=1) + relativedelta(months=1)).strftime(INDEX_DATE_FORMAT)),
         doc_type='data',
         id=hashlib.md5("{0}{1}{2}{3}{4}".format(sendtime, row['IMSI'], row['MSISDN'], int(row['ruleRef']), int(row['sponsorRef']))).digest(),
         body={
             'status': int(row['status']),
             'sendTime': sendtime
         })
But the issue with the index API is that it's very slow; it needs about 2 seconds to import just 50 records. I hoped the bulk API would help with the speed.
According to the hashlib documentation, the digest method will
Return the digest of the data passed to the update() method so far. This is a bytes object of size digest_size which may contain bytes in the whole range from 0 to 255.
So the resulting bytes may not be decodable to unicode.
>>> id_ = hashlib.md5('abc'.encode('utf-8')).digest()
>>> id_
b'\x90\x01P\x98<\xd2O\xb0\xd6\x96?}(\xe1\x7fr'
>>> id_.decode('utf-8')
Traceback (most recent call last):
File "<console>", line 1, in <module>
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x90 in position 0: invalid start byte
The hexdigest method will produce a string as output; from the docs:
Like digest() except the digest is returned as a string object of double length, containing only hexadecimal digits. This may be used to exchange the value safely in email or other non-binary environments.
>>> id_ = hashlib.md5('abc'.encode('utf-8')).hexdigest()
>>> id_
'900150983cd24fb0d6963f7d28e17f72'
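Applied to the set_data generator above, only the _id line needs to change, swapping digest() for hexdigest() (same inputs as the question's code):

# hexdigest() yields an ASCII hex string the bulk JSON serializer accepts,
# unlike the raw bytes produced by digest().
'_id': hashlib.md5("{0}{1}{2}{3}{4}".format(
    sendtime, row['IMSI'], row['MSISDN'],
    int(row['ruleRef']), int(row['sponsorRef']))).hexdigest(),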

How to continue a for loop with a 504 Gateway Timeout error response from an API?

I am using a package called lob to standardize a dataset of addresses I have. I have been receiving a 504 gateway error after running through a few thousand addresses. The response error from Lob is the following:
.......Traceback (most recent call last):
  File "verify_modified_v2.py", line 82, in <module>
    zip_code=row['zip_code'],
  File "C:\Users\******\Anaconda2\lib\site-packages\lob\resource.py", line 123, in create
    response = requestor.request('post', cls.endpoint, params)
  File "C:\Users\******\Anaconda2\lib\site-packages\lob\api_requestor.py", line 84, in request
    requests.post(lob.api_base + url, auth=(self.api_key, ''), data=data, files=files, headers=headers)
  File "C:\Users\******\Anaconda2\lib\site-packages\lob\api_requestor.py", line 27, in parse_response
    resp.content, resp.status_code, resp)
lob.error.APIConnectionError: {
  "error": {
    "message": "GATEWAY_TIMEOUT",
    "status_code": 504
  }
}
I have tried to catch this error so that my code can repeatedly contact Lob until it gets through without the gateway error:
for idx, row in enumerate(input_csv):
    try:
        verifiedAddress = lob.USVerification.create(
            primary_line=row['primary_line'],
            secondary_line=row['secondary_line'],
            city=row['city'],
            state=row['state'],
            zip_code=row['zip_code'],
        )
        if verifiedAddress.deliverability in success_deliverabilities:
            success_csv.writerow({
                'primary_line': verifiedAddress.primary_line,
                'secondary_line': verifiedAddress.secondary_line,
                'urbanization': verifiedAddress.urbanization,
                'last_line': verifiedAddress.last_line,
                'deliverability': verifiedAddress.deliverability,
                'identifier': row['identifier'],
                'status': row['2']
            })
        else:
            failure_csv.writerow({
                'primary_line': row['primary_line'],
                'secondary_line': row['secondary_line'],
                'city': row['city'],
                'state': row['state'],
                'zip_code': row['zip_code'],
                'deliverability': verifiedAddress.deliverability,
                'identifier': row['identifier'],
                'status': row['2']
            })
        # Print success
        sys.stdout.write('.')
        sys.stdout.flush()
        # New lines for larger csv's
        if idx % 10 == 9:
            sys.stdout.write('\n')
            sys.stdout.flush()
    except lob.error.APIConnectionError:
        print "caught error"
It does not seem that the gateway error is able to be "excepted"; does anyone have any thoughts on a way around this?
My end goal is this:
Bypass the error.
Log the error.
Continue with the next row of the csv file.
Thanks.
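No answer is recorded here, but one common pattern for the three goals above is a bounded retry loop around the create call. A sketch, assuming lob.error.APIConnectionError is what the timeout raises (as the traceback shows) and reusing the row fields from the question:

import logging
import time

MAX_RETRIES = 3

def verify_with_retry(row):
    # Retry the Lob call on gateway timeouts, backing off between attempts.
    for attempt in range(MAX_RETRIES):
        try:
            return lob.USVerification.create(
                primary_line=row['primary_line'],
                secondary_line=row['secondary_line'],
                city=row['city'],
                state=row['state'],
                zip_code=row['zip_code'],
            )
        except lob.error.APIConnectionError as e:
            logging.warning("Lob API error on row %s (attempt %d): %s",
                            row['identifier'], attempt + 1, e)
            time.sleep(2 ** attempt)
    return None  # all retries failed; the caller skips this row

for idx, row in enumerate(input_csv):
    verifiedAddress = verify_with_retry(row)
    if verifiedAddress is None:
        continue  # error already logged; move on to the next row
    # ... write to success_csv / failure_csv as in the question ...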

Filter items newer than 1 hour with RethinkDB and Python

I have a Python-script gathering some metrics and saving them to RethinkDB. I have also written a small Flask-application to display the data on a dashboard.
Now I need to run a query to find all rows in a table newer than 1 hour. This is what I got so far:
tzinfo = pytz.timezone('Europe/Oslo')
start_time = tzinfo.localize(datetime.now() - timedelta(hours=1))
r.table('metrics').filter(lambda m:
    m.during(start_time, r.now())
).run(connection)
When I try to visit the page I get this error message:
ReqlRuntimeError: Not a TIME pseudotype: `{
    "listeners": "6469",
    "time": {
        "$reql_type$": "TIME",
        "epoch_time": 1447581600,
        "timezone": "+01:00"
    }
}` in:
r.table('metrics').filter(lambda var_1:
    var_1.during(r.iso8601('2015-11-18T12:06:20.252415+01:00'), r.now()))
I googled a bit and found this thread which seems to be a similar problem: https://github.com/rethinkdb/rethinkdb/issues/4827, so I revisited how I add new rows to the database as well to see if that was the issue:
def _fix_tz(timestamp):
    tzinfo = pytz.timezone('Europe/Oslo')
    dt = datetime.strptime(timestamp[:-10], '%Y-%m-%dT%H:%M:%S')
    return tzinfo.localize(dt)

...

for row in res:
    # ... remove some data, manipulate some other data ...
    r.db('metrics',
         {'time': _fix_tz(row['_time']),
         ...
    ).run(connection)
The '_time' value retrieved by my data collection script contains some garbage, which I remove before creating a datetime object. As far as I can understand from the RethinkDB documentation, I should be able to insert these directly, and if I use the Data Explorer in RethinkDB's admin panel my rows look like this:
{
    ...
    "time": Sun Oct 25 2015 00:00:00 GMT+02:00
}
Update:
I did another test and created a small script to insert data and then retrieve it:
import rethinkdb as r

conn = r.connect(host='localhost', port=28015, db='test')

r.table('timetests').insert({
    'time': r.now(),
    'message': 'foo!'
}).run(conn)

r.table('timetests').insert({
    'time': r.now(),
    'message': 'bar!'
}).run(conn)

cursor = r.table('timetests').filter(
    lambda t: t.during(r.now() - 3600, r.now())
).run(conn)
I still get the same error message:
$ python timestamps.py
Traceback (most recent call last):
  File "timestamps.py", line 21, in <module>
    ).run(conn)
  File "/Users/tsg/.virtualenv/p4-datacollector/lib/python2.7/site-packages/rethinkdb/ast.py", line 118, in run
    return c._start(self, **global_optargs)
  File "/Users/tsg/.virtualenv/p4-datacollector/lib/python2.7/site-packages/rethinkdb/net.py", line 595, in _start
    return self._instance.run_query(q, global_optargs.get('noreply', False))
  File "/Users/tsg/.virtualenv/p4-datacollector/lib/python2.7/site-packages/rethinkdb/net.py", line 457, in run_query
    raise res.make_error(query)
rethinkdb.errors.ReqlQueryLogicError: Not a TIME pseudotype: `{
    "id": "5440a912-c80a-42dd-9d27-7ecd6f7187ad",
    "message": "bar!",
    "time": {
        "$reql_type$": "TIME",
        "epoch_time": 1447929586.899,
        "timezone": "+00:00"
    }
}` in:
r.table('timetests').filter(lambda var_1: var_1.during((r.now() - r.expr(3600)), r.now()))
I finally figured it out. The error is in the lambda expression: you need to call .during() on a specific field. Otherwise the query will try to wrestle the whole row/document into a timestamp.
This code works:
cursor = r.table('timetests').filter(
    lambda t: t['time'].during(r.now() - 3600, r.now())
).run(conn)
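As a side note beyond the original answer: once the field-level during() works, the same one-hour window can also be served by an indexed between() query, which avoids scanning every row. A sketch, assuming a secondary index named 'time' is created once:

# One-time setup: secondary index on the 'time' field.
r.table('timetests').index_create('time').run(conn)
r.table('timetests').index_wait('time').run(conn)

# Range query against the index instead of filtering all rows.
cursor = r.table('timetests').between(
    r.now() - 3600, r.now(), index='time'
).run(conn)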
