I am working with the WooCommerce REST API, pushing product data from MongoDB using Python. I wrote a script for this. The first time, the script ran successfully and pushed two products, but when I tried to push more products it failed, and I don't know where I am going wrong.
It shows this error:
requests.exceptions.ReadTimeout: HTTPConnectionPool(host='localhost', port=80): Read timed out. (read timeout=5)
script:
from woocommerce import API
import os, sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from portal_v2.settings import _MONGO_DB_URI
from pymongo import MongoClient
import os, csv, ast
wcapi = API(
    url="http://localhost/wordpress1",
    consumer_key="ck_******************************************",
    consumer_secret="cs_******************************************",
    wp_api=True,
    version="wc/v1"
)

class Products(object):
    def __init__(self, dbname):
        _client = MongoClient(_MONGO_DB_URI)
        self._database = _client[dbname]
        self.getCollections()
        self.dbname = dbname

    def getCollections(self):
        self.products = self._database['products']

    def getProducts(self):
        product_dict = []
        for each in self.products.find().limit(10):
            product_dict.append({
                "name": each['name'],
                "slug": each['name'].replace(' ', '').replace('/', '') + each['sku'].replace(' ', '').replace('/', ''),
                "type": "simple", "status": "publish", "featured": False, "catalog_visibility": "visible",
                "description": each['description'], "sku": each['sku'],
                "regular_price": each['mrp'], "sale_price": each['cost_price'],
                "date_on_sale_from": "", "date_on_sale_to": "", "purchasable": True, "total_sales": 0,
                "virtual": False, "downloadable": False, "downloads": [], "download_limit": -1,
                "download_expiry": -1, "download_type": "standard", "external_url": "", "button_text": "",
                "tax_status": "taxable", "tax_class": "", "manage_stock": False, "stock_quantity": None,
                "in_stock": True, "backorders": "no", "backorders_allowed": False, "backordered": False,
                "sold_individually": False, "weight": each['weight_gms'],
                "dimensions": {"length": each['length_mm'], "width": each['width_mm'], "height": each['height_mm']},
                "shipping_required": True, "shipping_taxable": True, "shipping_class": "", "shipping_class_id": 0,
                "reviews_allowed": True, "average_rating": "0.00", "rating_count": 0,
                "related_ids": [], "upsell_ids": [], "cross_sell_ids": [], "parent_id": 0, "purchase_note": "",
                "categories": [{"id": 9}, {"id": 14}], "tags": [],
                "images": [{"src": each['image_url'].replace('dl=0', 'raw=1'), "name": each['name'], "position": 0}],
                "attributes": [], "default_attributes": [], "variations": [], "menu_order": 0
            })
        data = {'create': product_dict}
        print data
        return data

    def updateProducts(self, data):
        wcapi.post("products/batch", data)
        # print wcapi.get("products/?per_page=45").json()

data = Products('xyz').getProducts()
Products('xyz').updateProducts(data)
where 'xyz' is the database name.
Just increase the timeout option, something like:
wcapi = API(
    url="http://localhost/wordpress1",
    consumer_key="ck_******************************************",
    consumer_secret="cs_******************************************",
    wp_api=True,
    version="wc/v1",
    timeout=10  # the default is 5; increase it to whatever works for you
)
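If a larger timeout alone is not enough, you can also keep each request small by splitting the batch into chunks. A rough sketch, reusing the Products class and wcapi from the question (the chunk size of 10 is an arbitrary choice; tune it to whatever your server handles comfortably):
def chunked(items, size=10):
    """Yield successive slices of `items` with at most `size` elements."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

data = Products('xyz').getProducts()
for chunk in chunked(data['create'], size=10):
    # Each request now carries at most 10 products, so it is far less
    # likely to exceed the read timeout.
    wcapi.post("products/batch", {'create': chunk})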
I am using the
adf_client.pipeline_runs.query_by_factory(resourceGroupName, factoryName, filter_parameters)
method of the azure.mgmt.datafactory.DataFactoryManagementClient client to fetch ADF pipeline run details.
The above call returns 100 pipeline run records at a time. Along with the results, the response includes a continuation_token which, I believe, is supposed to be used to fetch the next set/page of records.
I am not sure which function should be used for this. I tried the azure.mgmt.datafactory.models.PipelineRun() function (trial and error) to see if it satisfies the requirement; unfortunately, it doesn't. The MS documentation is also very abstract.
So, which function in Azure's Python SDK can be used to fetch the next page of run records?
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.datafactory import DataFactoryManagementClient
from azure.mgmt.datafactory.models import *
from datetime import datetime, timedelta
from azure.identity import ClientSecretCredential
subscription_id = "b83c1wd3-xxxx-xxxx-xxxx-2b83a074c23f"
rg_name = "My-rg"
df_name = "ktestadf"
tenant_id = "12f978bf-xxxx-xxxx-xxxx-2d7cd011db47"
client_id = "a71ad3ca-xxxx-xxxx-xxxx-af0c2a3fdae1"
client_secret = "Nym7Q~j5YMyxxxxxx3tAk879y9vLrxAQqaI8n"
credential = ClientSecretCredential(
tenant_id=tenant_id ,
client_id=client_id ,
client_secret=client_secret)
adf_client = DataFactoryManagementClient( credential=credential,
subscription_id=subscription_id)
pipe_run = []
dsb = datetime.now()
dsa = dsb - timedelta(hours = 24)
filter_params = RunFilterParameters(last_updated_after=dsa, last_updated_before=dsb)
pipeline_runs = adf_client.pipeline_runs.query_by_factory(resource_group_name=rg_name, factory_name=df_name, filter_parameters = filter_params)
pipe_run.append(pipeline_runs.value)
while pipeline_runs.continuation_token:
    filter_params = RunFilterParameters(last_updated_after=dsa, last_updated_before=dsb,
                                        continuation_token=pipeline_runs.continuation_token)
    pipeline_runs = adf_client.pipeline_runs.query_by_factory(
        resource_group_name=rg_name, factory_name=df_name, filter_parameters=filter_params)
    pipe_run.append(pipeline_runs.value)
You get a continuation_token when there is a next page of results: it is set if any results remain, and null otherwise.
Here is an example of its usage; however, I currently don't have enough pipeline runs to show the token itself.
Now, in your case you have received one, so here is how you can use it.
Since pipeline_runs holds the results, pipeline_runs.continuation_token is what we need to read and pass back in another request to get the next page.
Add a simple loop, say a while loop that checks whether pipeline_runs.continuation_token exists, requesting the next page until the token comes back as null, which marks the end of the results.
Complete working implementation:
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.datafactory import DataFactoryManagementClient
from azure.mgmt.datafactory.models import *
from datetime import datetime, timedelta
from azure.identity import ClientSecretCredential
subscription_id = "b83c1wd3-xxxx-xxxx-xxxx-2b83a074c23f"
rg_name = "My-rg"
df_name = "ktestadf"
tenant_id = "12f978bf-xxxx-xxxx-xxxx-2d7cd011db47"
client_id = "a71ad3ca-xxxx-xxxx-xxxx-af0c2a3fdae1"
client_secret = "Nym7Q~j5YMyxxxxxx3tAk879y9vLrxAQqaI8n"
credentials = ServicePrincipalCredentials(client_id=client_id, secret=client_secret, tenant=tenant_id)
adf_client = DataFactoryManagementClient(credentials, subscription_id)
filter_params = PipelineRunFilterParameters(last_updated_after=datetime.now() - timedelta(30), last_updated_before=datetime.now() + timedelta(1))
pipeline_runs = adf_client.pipeline_runs.query_by_factory(resource_group_name=rg_name, factory_name=df_name, filter_parameters = filter_params)
for pipeline_run in pipeline_runs.value:
    print(pipeline_run)

while pipeline_runs.continuation_token:
    pipeline_runs = adf_client.pipeline_runs.query_by_factory(
        resource_group_name=rg_name, factory_name=df_name, filter_parameters=filter_params,
        continuation_token=pipeline_runs.continuation_token)
    print(pipeline_runs.value)
Optionally, you can skip printing pipeline_runs from the earlier call in the for loop; I left it in the code just for reference.
I am working on creating a custom image in IBM Cloud using Python. I have very simple, straightforward code that just creates the image, and it fails.
As far as I can tell, I am passing the correct details for all the parameters.
Still, I get an error which is not very descriptive:
ERROR:root:Please check whether the resource you are requesting exists.
Traceback (most recent call last):
File "/Users/deepali.mittal/GITHUB/dcoa/python/build/dmittal/virtual-env36/lib/python3.6/site-packages/ibm_cloud_sdk_core/base_service.py", line 246, in send
response.status_code, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: Please check whether the resource you are requesting exists., Code: 400
Process finished with exit code 0
This is not related to a missing resource in COS; if it cannot find the image in COS, it gives a different error.
Code:
from ibm_vpc import VpcV1 as vpc_client
from ibm_cloud_sdk_core import ApiException
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from boto3 import client as boto3_client
import logging

# logging.basicConfig(level=logging.DEBUG)

SOURCE_OBJECT_PATH = 'cos://us-south/de-images-dmittal/abc.qcow2'
RESOURCE_GROUP_ID = '1234'
OPERATING_SYSTEM = 'ubuntu-16-amd64'

def create_ssm_client():
    ssm_client = boto3_client("ssm", region_name="us-west-2")
    return ssm_client

def retrieve_ibm_config(ssm_client):
    params = ["/ibm/service-key"]
    response = ssm_client.get_parameters(Names=params, WithDecryption=True)
    try:
        api_key = response["Parameters"][0]["Value"]
    except (ValueError, IndexError):
        raise RuntimeError(
            f"Required SSM parameters not retrieved. "
            f'Required parameters are: {params}.'
        )
    return api_key

def create_authenticator(api_key):
    authenticator = IAMAuthenticator(api_key)
    return authenticator

def create_ibm_client(authenticator):
    ibm_client = vpc_client('2021-05-28', authenticator=authenticator)
    return ibm_client

def create_image_prototype():
    image_file_prototype_model = {'href': SOURCE_OBJECT_PATH}
    operating_system_identity_model = {'name': OPERATING_SYSTEM}
    resource_group_identity_model = {'id': RESOURCE_GROUP_ID}
    image_prototype_model = {
        'name': 'my-image',
        # 'resource_group': resource_group_identity_model,
        'file': image_file_prototype_model,
        'operating_system': operating_system_identity_model
    }
    image_prototype = image_prototype_model
    return image_prototype

def create_image():
    ssm_client = create_ssm_client()
    api_key = retrieve_ibm_config(ssm_client)
    authenticator = create_authenticator(api_key)
    ibm_client = create_ibm_client(authenticator)
    image_prototype = create_image_prototype()
    try:
        # images = ibm_client.list_images()
        # print(vpc)
        # ibm_client.set_service_url('https://us-south.iaas.cloud.ibm.com/v1')
        response = ibm_client.create_image(image_prototype)
        print(response)
    except ApiException as e:
        print("Failed")

if __name__ == "__main__":
    create_image()
The issue was with an IAM permission. After fixing that it worked; the error shown was not relevant, so it took time to figure out.
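For anyone hitting the same thing: the except block in the code above hides the details. Printing the fields that ibm_cloud_sdk_core's ApiException carries (the sketch below assumes the usual code and message attributes of the installed SDK version) makes permission errors like this much quicker to spot:
try:
    response = ibm_client.create_image(image_prototype)
    print(response)
except ApiException as e:
    # Surface the HTTP status code and message instead of a bare "Failed",
    # so IAM/permission problems show up immediately.
    print("create_image failed: code={}, message={}".format(e.code, e.message))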
I am looking to fetch data from and publish data to Cloudant from a Spark Streaming job. My code is as follows:
from CloudantPublisher import CloudantPublisher
from CloudantFetcher import CloudantFetcher
import pyspark
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from kafka import KafkaConsumer, KafkaProducer
import json
class SampleFramework():

    def __init__(self):
        pass

    @staticmethod
    def messageHandler(m):
        return json.loads(m.message)

    @staticmethod
    def processData(rdd):
        if rdd.isEmpty():
            SampleFramework.logger.info("RDD is empty")
            return
        # Expand
        expanded_rdd = rdd.mapPartitions(CloudantFetcher.fetch)
        expanded_rdd.foreachPartition(CloudantPublisher.publish)

    def run(self, ssc):
        self.ssc = ssc
        directKafkaStream = KafkaUtils.createDirectStream(
            self.ssc, [SUBSCRIBE_QUEUE],
            {"metadata.broker.list": METADATA, "bootstrap.servers": BOOTSTRAP_SERVERS},
            messageHandler=SampleFramework.messageHandler)
        directKafkaStream.foreachRDD(SampleFramework.processData)
        ssc.start()
        ssc.awaitTermination()
Other supporting classes -
from CloudantConnector import CloudantConnector

class CloudantFetcher:
    config = Config.createConfig()
    cloudantConnector = CloudantConnector(config)

    @staticmethod
    def fetch(data):
        final_data = []
        for row in data:
            id = row["id"]
            if not CloudantFetcher.cloudantConnector.isReady():
                CloudantFetcher.cloudantConnector.open()
            data_json = CloudantFetcher.cloudantConnector.getOne({"id": id})
            row["data"] = data_json
            final_data.append(row)
        CloudantFetcher.cloudantConnector.close()
        return final_data
class CloudantPublisher:
    config = Config.createConfig()
    cloudantConnector = CloudantConnector(config)

    @staticmethod
    def publish(data):
        CloudantPublisher.cloudantConnector.open()
        CloudantPublisher.cloudantConnector.postAll(data)
        CloudantPublisher.cloudantConnector.close()
from cloudant.client import Cloudant
from cloudant.result import Result
from cloudant.result import QueryResult
from cloudant.document import Document
from cloudant.query import Query
from cloudant.database import CloudantDatabase
import json

class CloudantConnector:
    def __init__(self, config):
        self.config = config
        self.client = Cloudant(self.config["cloudant"]["credentials"]["username"],
                               self.config["cloudant"]["credentials"]["password"],
                               url=self.config["cloudant"]["host"]["full"])
        self.initialized = False
        self.db_name = self.config["cloudant"]["host"]["db_name"]

    def open(self):
        try:
            self.client.connect()
            self.logger.info("Connection to Cloudant established.")
            self.initialized = True
        except:
            raise Exception("Could not connect to Cloudant! Please verify credentials.")
        self.database = CloudantDatabase(self.client, self.db_name)
        if not self.database.exists():
            self.database.create()

    def isReady(self):
        return self.initialized

    def close(self):
        self.client.disconnect()

    def getOne(self, query):
        new_filter = query
        query = Query(self.database, selector=query, limit=1)
        results_string = json.dumps(query.result[0][0])
        results_json = json.loads(results_string)
        return results_json

    def postAll(self, docs):
        documents = []
        quantum = self.config["cloudant"]["constants"]["bulk_quantum"]
        count = 0
        for doc in docs:
            document = Document(self.database)
            document["id"] = doc["id"]
            document["data"] = doc["data"]
            documents.append(document)
            count = count + 1
            if count % quantum == 0:
                self.database.bulk_docs(documents)
                documents = []
        if len(documents) != 0:
            self.database.bulk_docs(documents)
        self.logger.debug("Uploaded document to the Cloudant database.")
My implementation works, but it is slow compared to what I would expect if I were not initializing the Cloudant connection in each partition and instead maintained a static pool of connections that each partition could fetch and use.
My Questions are as follows:
Do I need to create a connection pool with the Cloudant 2.0 API in Python? (It seems one already exists within the API.) If yes, how should I go about it? The closest implementation I have seen is this - link - but it is based on an outdated Cloudant API and does not work with the new one.
If the answer to the above is yes, how can I make this accessible to the workers? I see references to creating serializable, lazily instantiated connection-client objects here. This would mean making a lazily instantiated Cloudant connection object in the SampleFramework. How can I do this in Python, just like what is given in the Spark documentation?
connection = ConnectionPool.getConnection()
for record in iter:
    connection.send(record)
ConnectionPool.returnConnection(connection)
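Roughly, I imagine the Python equivalent would be something like the sketch below, keeping a module-level connector so each executor lazily opens one connection and reuses it across partitions (the cloudant_pool module and helper names are just my guesses, reusing the CloudantConnector class above; I have not verified this):
# cloudant_pool.py -- hypothetical helper module shipped to the executors
from CloudantConnector import CloudantConnector

_connector = None  # one connector per executor process, created lazily on first use

def get_connector(config):
    """Return this executor's CloudantConnector, opening it on the first call."""
    global _connector
    if _connector is None:
        _connector = CloudantConnector(config)
        _connector.open()
    return _connector

def publish_partition(rows, config):
    """Publish one partition's rows through the shared, already-open connector."""
    connector = get_connector(config)
    connector.postAll(list(rows))

# and in SampleFramework.processData, something like:
#   expanded_rdd.foreachPartition(lambda rows: publish_partition(rows, config))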
If the above is not possible, how do I speed up my operations? The only alternative I can think of is maintaining a single connection on the driver program, collecting the data from all workers, and then fetching/uploading it there. This would decrease the number of times I need to connect to Cloudant, but would take away the distributed fetching/publishing architecture.
What I'm trying to do is go through a 5 GB XML file of products for a website and eventually add the data to a datastore. I'm just playing around with queues now, and my idea was to create a task that reads through the file line by line, takes every 50 products, and sends them to another queue to be processed (eventually into the datastore). I'm testing this on a much smaller XML file. My problem is within OpenFileQueue: it's enqueuing a task even when the condition "if ((self.count % 50) == 0):" has not been met. Any ideas on what might be going on, or on better ways to read through this file? It feels like a bad hack the way I'm doing it now. The test file I'm using has around 170 products; when I run the code as it is now and call /gcs, I end up with about 86 tasks. Not sure what is going on here.
import webapp2
import os
import datetime
import time
from lxml import etree
import sys
import codecs
import time
import gc
import logging
from google.appengine.ext import db
from google.appengine.api import search
import cloudstorage as gcs
from google.appengine.api import taskqueue

my_default_retry_params = gcs.RetryParams(initial_delay=0.2,
                                          max_delay=5.0,
                                          backoff_factor=2,
                                          max_retry_period=15)
gcs.set_default_retry_params(my_default_retry_params)
logging.getLogger().setLevel(logging.DEBUG)

class GoogleCloudStorage(webapp2.RequestHandler):
    def get(self):
        bucket = '/newegg-catalog'
        self.response.headers['Content-Type'] = 'text/plain'
        self.tmp_filenames_to_clean_up = []
        filename = bucket + '/ndd.xml'
        taskqueue.add(url='/openfile', params={'filename': filename})
        self.redirect('/')

class AddFileParts(webapp2.RequestHandler):
    def post(self):
        data = self.request.get('data')
        logging.debug('PROCESSING %s', data)

class OpenFileQueue(webapp2.RequestHandler):
    def __init__(self, request, response):
        self.initialize(request, response)
        self.Plist = []
        self.masterList = []
        self.count = 0

    def post(self):
        filename = self.request.get('filename')
        logging.debug('Opening file %s', filename)
        gcs_file = gcs.open(filename)
        while True:
            line = gcs_file.readline()
            self.Plist.append(line)
            if line.strip() == "</product>":
                self.masterList.append(self.Plist)
                self.Plist = []
                self.count += 1
            if ((self.count % 50) == 0):
                logging.debug('Starting queue of items up to %s with 50 items', self.count)
                taskqueue.add(url='/adddata', params={'data': self.masterList})
                self.masterList = []
            if line.strip() == "</catalog>":
                break
        gcs_file.close()

app = webapp2.WSGIApplication([('/adddata', AddFileParts),
                               ('/openfile', OpenFileQueue),
                               ('/gcs', GoogleCloudStorage)],
                              debug=True)
When a line matches "</product>", it appends to self.masterList and increments self.count (eventually to 50). But if the next line is not "</product>", the count will still be 50, so the modulo check passes again and another task is added to the queue.
Instead, use the length of self.masterList because it is reset after being added to the queue:
if len(self.masterList) >= 50:
    logging.debug('Starting queue of items up to %s with 50 items', len(self.masterList))
    taskqueue.add(url='/adddata', params={'data': self.masterList})
    self.masterList = []
and remove all references to self.count.
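One more detail worth checking: with the check above, whatever is still sitting in self.masterList when "</catalog>" is reached (a final batch of fewer than 50 products) never gets enqueued. A rough sketch of the adjusted loop, keeping the handler's existing names, that moves the check inside the "</product>" branch and also flushes that remainder:
while True:
    line = gcs_file.readline()
    self.Plist.append(line)
    if line.strip() == "</product>":
        self.masterList.append(self.Plist)
        self.Plist = []
        if len(self.masterList) >= 50:
            # A full batch of 50 products is ready; hand it off.
            taskqueue.add(url='/adddata', params={'data': self.masterList})
            self.masterList = []
    if line.strip() == "</catalog>":
        break
# Flush whatever is left over (the final batch of fewer than 50 products).
if self.masterList:
    taskqueue.add(url='/adddata', params={'data': self.masterList})
gcs_file.close()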
I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a URL (while looping) and have the Google language API return what language each keyword is in.
I have it working manually. If I enter (with the correct API key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct; 'merde' is French.
So far I have this code, but I keep getting server-unreachable errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

# not working
def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return row in result_object

# not working
def retrieve_terms(seedterm):
    print(seedterm)
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
    url = url_template % {"seed": seedterm}
    try:
        with urllib2.urlopen(url) as data:
            data = perform_request(seedterm)
            result = data.read()
    except:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)
    # terms = parse_result(result)
    # print terms
    print result

def main(argv):
    filename = argv[1]
    csvfile = open(filename, 'r')
    csvreader = csv.DictReader(csvfile)
    rows = []
    for row in csvreader:
        rows.append(row)
    sortedrows = sorted(rows, key=itemgetter('impressions'), reverse=True)
    keys = sortedrows[0].keys()
    for item in sortedrows:
        retrieve_terms(item['keywords'])
    try:
        outputfile = open('Output_%s.csv' % (filename), 'w')
    except IOError:
        print("The file is active in another program - close it first!")
        sys.exit()
    dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(sortedrows)
    outputfile.close()
    print("File is Done!! Check your folder")

if __name__ == '__main__':
    start_time = time.clock()
    main(sys.argv)
    print("\n")
    print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try adding the referrer and userip as described in the docs:
An area to pay special attention to relates to correctly identifying yourself in your requests. Applications MUST always include a valid and accurate http referer header in their requests. In addition, we ask, but do not require, that each request contains a valid API Key. By providing a key, your application provides us with a secondary identification mechanism that is useful should we need to contact you in order to correct any problems. Read more about the usefulness of having an API key.
Developers are also encouraged to make use of the userip parameter (see below) to supply the IP address of the end-user on whose behalf you are making the API request. Doing so will help distinguish this legitimate server-side traffic from traffic which doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q': 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"

if userip:
    query.update(userip=userip)
if api_key:
    query.update(key=api_key)

url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' % (
    urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
    value = seedterm
else:  # bytes
    value = seedterm.decode(put_encoding_here)

url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))
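Putting the two points together, a rough sketch of how retrieve_terms from the question could look (Python 2; the default referrer here is just the one from the example above, 'myapikey' is still a placeholder, and the CSV is assumed to be UTF-8 encoded):
import json
import urllib
import urllib2

def retrieve_terms(seedterm, api_key='myapikey', userip=None,
                   referrer='https://stackoverflow.com/q/4309599/4279'):
    """Detect the language of one keyword and return the parsed responseData dict."""
    if isinstance(seedterm, unicode):
        value = seedterm
    else:  # bytes read from the CSV; adjust the encoding if the file is not UTF-8
        value = seedterm.decode('utf-8')
    query = {'v': '1.0', 'q': value.encode('utf-8'), 'key': api_key}
    if userip:
        query['userip'] = userip
    url = 'http://ajax.googleapis.com/ajax/services/language/detect?%s' % urllib.urlencode(query)
    request = urllib2.Request(url, headers={'Referer': referrer})
    json_data = json.load(urllib2.urlopen(request))
    return json_data['responseData']  # e.g. {'language': 'fr', 'isReliable': False, ...}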