Scrapyd jobs not starting

Scrapyd jobs not starting - python

I integrated scrapy in my Django project following this guide
Unfortunately, whatever I try, the spider jobs do not start, even though schedule.json returns a jobid.
My views:
@csrf_exempt
@api_view(['POST'])
def crawl_url(request):
    """Validate the posted URL and schedule a ``kw_spider`` crawl via scrapyd.

    Reads ``url`` from ``request.POST``. Returns a ``JsonResponse`` carrying an
    ``error`` key when the URL is missing or rejected by ``is_valid_url``;
    otherwise returns the scrapyd task id, a freshly generated ``unique_id``
    and ``status: 'started'``.
    """
    url = request.POST.get('url', None)  # takes url from request
    if not url:
        return JsonResponse({'error': 'Missing args'})
    if not is_valid_url(url):
        return JsonResponse({'error': 'URL is invalid'})

    domain = urlparse(url).netloc  # parse the url and extract the domain
    unique_id = str(uuid4())  # creates a unique ID.

    # Custom settings for scrapy spider.
    # We can send anything we want to use it inside spiders and pipelines.
    settings = {
        'unique_id': unique_id,  # unique ID for each record for DB
        'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    }

    # Schedule a new crawling task from scrapyd.
    # settings is a special argument name.
    # This returns an ID which belongs to this task, used to check the task status
    task = scrapyd.schedule('default', 'kw_spider', settings=settings, url=url, domain=domain)
    return JsonResponse({'task_id': task, 'unique_id': unique_id, 'status': 'started'})
@csrf_exempt
@api_view(['GET'])
def get_crawl_data(request):
    """Report the status of a scheduled crawl and, once finished, its data.

    Expects ``task_id`` and ``unique_id`` in the query string. Returns a
    ``JsonResponse`` with ``status`` and ``data`` keys, or an ``error`` key
    when either argument is missing.
    """
    task_id = request.GET.get('task_id', None)
    unique_id = request.GET.get('unique_id', None)
    if not task_id or not unique_id:
        return JsonResponse({'error': 'Missing args'})

    # Check status of crawling
    # If finished, makes query from database and get results
    # If not, return active status
    # Possible results are -> pending, running, finished
    status = scrapyd.job_status('default', task_id)
    if status == '' or status is None:
        return JsonResponse({
            'status': 'error',
            'data': 'Task not found',
        })

    if status != 'finished':
        return JsonResponse({
            'status': status,
            'data': {},
        })

    try:
        item = ScrapyItem.objects.get(unique_id=unique_id)
        # NOTE(review): to_dict is subscripted rather than called, so it is
        # presumably a property on ScrapyItem — confirm against the model.
        return JsonResponse({
            'status': status,
            'data': item.to_dict['data'],
        })
    except Exception as e:
        # Any lookup/serialization failure is reported to the caller as text.
        return JsonResponse({
            'status': 'error',
            'data': str(e),
        })
My spider:
class KwSpiderSpider(CrawlSpider):
    """Crawl spider whose start URL and allowed domain are supplied at run time."""

    name = 'kw_spider'

    def __init__(self, *args, **kwargs):
        """Configure the spider from the ``url`` and ``domain`` keyword arguments."""
        # __init__ overridden to have a dynamic spider
        # args passed from django views
        self.url = kwargs.get('url')
        self.domain = kwargs.get('domain')
        self.start_urls = [self.url]
        self.allowed_domains = [self.domain]

        # The rules are assigned on the class before the parent initializer runs.
        KwSpiderSpider.rules = [
            Rule(LinkExtractor(unique=True), callback='parse_item'),
        ]
        super(KwSpiderSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        """Return a dict holding the URL of the crawled ``response``."""
        resp_dict = {
            'url': response.url
        }
        # resp_dict['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # resp_dict['name'] = response.xpath('//div[@id="name"]').extract()
        # resp_dict['description'] = response.xpath('//div[@id="description"]').extract()
        return resp_dict
I also tried with a curl call
curl http://localhost:6800/schedule.json -d project=default -d spider=kw_spider
which gave me the following response:
{"node_name": "9jvtf82", "status": "ok", "jobid": "0ca057026e5611e8898f64006a668b22"}
But nothing happens, the job doesn't start

I solved it by noticing an error in the scrapyd console log.
I was missing the pywin32 library, though I don't understand why this wasn't in the requirements.
A simple
pip install pywin32
fixed it

Related

difficulty setting up consumer test pact python

I'm trying to set up a consumer test with Pact, but I'm struggling. If someone could show me where I'm going wrong, it would be appreciated.
The file I am trying to test is as follows:
import requests
from orders_service.exceptions import (
APIIntegrationError,
InvalidActionError
)
class OrderItem:
    """A single line of an order: a product in a given size and quantity."""

    def __init__(self, id, product, quantity, size):
        """Store the item fields.

        The parameter is named ``id`` because items are built from keyword
        mappings (``OrderItem(**item)``), so the name is part of the interface.
        """
        self.id = id
        self.product = product
        self.quantity = quantity
        self.size = size

    def dict(self):
        """Return the item as a plain dict; ``id`` is intentionally not included."""
        return {
            'product': self.product,
            'size': self.size,
            'quantity': self.quantity,
        }
class Order:
    """An order with its items and the actions that call the kitchen/payment APIs."""

    def __init__(self, id, created, items, status, schedule_id=None,
                 delivery_id=None, order_=None):
        """Build an order.

        ``items`` is an iterable of keyword mappings, each expanded into an
        ``OrderItem``. ``order_`` is an optional backing object used as a
        fallback for ``id``, ``created`` and ``status`` when the explicit
        value is falsy.
        """
        self._order = order_
        self._id = id
        self._created = created
        self.items = [OrderItem(**item) for item in items]
        # ``status`` is a read-only property, so the value must be stored on
        # the private attribute; assigning ``self.status`` raises AttributeError.
        self._status = status
        self.schedule_id = schedule_id
        self.delivery_id = delivery_id

    @property
    def id(self):
        """Explicit id, or the backing order's id when the explicit one is falsy."""
        return self._id or self._order.id

    @property
    def created(self):
        """Explicit creation value, or the backing order's when falsy."""
        return self._created or self._order.created

    @property
    def status(self):
        """Explicit status, or the backing order's status when falsy."""
        return self._status or self._order.status

    def cancel(self):
        """Cancel the order.

        For status ``'progress'`` the kitchen cancel endpoint is called and
        ``APIIntegrationError`` is raised unless it answers 200. For status
        ``'delivery'`` ``InvalidActionError`` is raised. Any other status is a
        no-op that returns ``None``.
        """
        if self.status == 'progress':
            response = requests.get(
                f'http://localhost:3001/kitchen/schedule/{self.schedule_id}/cancel',
                # Serialize items the same way ``schedule`` does instead of
                # sending raw OrderItem objects.
                data={'order': [item.dict() for item in self.items]}
            )
            if response.status_code == 200:
                return
            raise APIIntegrationError(
                f'Could not cancel order with id {self.id}'
            )
        if self.status == 'delivery':
            raise InvalidActionError(f'Cannot cancel order with id {self.id}')

    def pay(self):
        """Post the order id to the payments endpoint; raise unless it answers 200."""
        response = requests.post(
            'http://localhost:3001/payments', data={'order_id': self.id}
        )
        if response.status_code == 200:
            return
        raise APIIntegrationError(
            f'Could not process payment for order with id {self.id}'
        )

    def schedule(self):
        """Schedule the order with the kitchen and return the ``id`` from its JSON reply.

        Raises ``APIIntegrationError`` unless the endpoint answers 201.
        """
        response = requests.post(
            'http://localhost:3000/kitchen/schedule',
            data={'order': [item.dict() for item in self.items]}
        )
        if response.status_code == 201:
            return response.json()['id']
        raise APIIntegrationError(
            f'Could not schedule order with id {self.id}'
        )

    def dict(self):
        """Return the order as a plain dict of id, items, status and creation value."""
        return {
            'id': self.id,
            'order': [item.dict() for item in self.items],
            'status': self.status,
            'created': self.created,
        }
I just can't get the consumer test to the stage where it publishes the contract. There are two areas I'm not too familiar with: first, the pytest fixture, where I'm really unsure what needs to go or how to write it; and second, the "consumer.cancel()" call at the very bottom of the test.
Some help getting me set up and on the way would be greatly appreciated. Here is what I wrote for the test:
import atexit
from datetime import datetime
import logging
import os
from uuid import UUID
import requests
import pytest
import subprocess
from pact import Consumer, Like, Provider, Term, Format
from orders_service.orders import Order, OrderItem
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# If publishing the Pact(s), they will be submitted to the Pact Broker here.
# For the purposes of this example, the broker is started up as a fixture defined
# in conftest.py. For normal usage this would be self-hosted or using Pactflow.
PACT_BROKER_URL = "https://xxx.pactflow.io/"
# NOTE(review): the bare ``xxx`` values below look like redacted placeholders;
# as written they are undefined names — supply real credentials as strings.
PACT_BROKER_USERNAME = xxx
PACT_BROKER_PASSWORD = xxx
# Define where to run the mock server, for the consumer to connect to. These
# are the defaults so may be omitted
PACT_MOCK_HOST = "localhost"
PACT_MOCK_PORT = 1234
# Where to output the JSON Pact files created by any tests
PACT_DIR = os.path.dirname(os.path.realpath(__file__))
@pytest.fixture
def consumer() -> Order.cancel:
    """Fixture from the question, kept as asked so the discussion still applies.

    NOTE(review): this calls ``Order.cancel`` once and returns its result
    rather than an object the test can call. It also passes the ``UUID``
    class and the ``datetime.now`` function instead of values, and hands
    ``Order`` ready-made ``OrderItem`` objects although ``Order`` expands each
    item with ``OrderItem(**item)``.
    """
    # return Order.cancel("http://{host}:{port}".format(host=PACT_MOCK_HOST, "port=PACT_MOCK_PORT))
    order = [OrderItem(**{"id": 1, "product": "coffee", "size": "big", "quantity": 2})]
    payload = Order(id=UUID, created=datetime.now, items=order, status="progress")
    return Order.cancel(payload)
@pytest.fixture(scope="session")
def pact(request):
    """Setup a Pact Consumer, which provides the Provider mock service. This
    will generate and optionally publish Pacts to the Pact Broker"""
    # When publishing a Pact to the Pact Broker, a version number of the Consumer
    # is required, to be able to construct the compatability matrix between the
    # Consumer versions and Provider versions
    # version = request.config.getoption("--publish-pact")
    # publish = True if version else False
    pact = Consumer("UserServiceClient", version=1).has_pact_with(
        Provider("UserService"),
        host_name=PACT_MOCK_HOST,
        port=PACT_MOCK_PORT,
        pact_dir=PACT_DIR,
        publish_to_broker=True,
        broker_base_url=PACT_BROKER_URL,
        broker_username=PACT_BROKER_USERNAME,
        broker_password=PACT_BROKER_PASSWORD,
    )

    pact.start_service()

    # Make sure the Pact mocked provider is stopped when we finish, otherwise
    # port 1234 may become blocked
    atexit.register(pact.stop_service)

    yield pact

    # This will stop the Pact mock server, and if publish is True, submit Pacts
    # to the Pact Broker
    pact.stop_service()

    # Given we have cleanly stopped the service, we do not want to re-submit the
    # Pacts to the Pact Broker again atexit, since the Broker may no longer be
    # available if it has been started using the --run-broker option, as it will
    # have been torn down at that point
    pact.publish_to_broker = False
def test_cancel_scheduled_order(pact, consumer):
    """Consumer test from the question, kept as asked so the discussion still applies."""
    expected = {
        "id": "1e54e244-d0ab-46ed-a88a-b9e6037655ef",
        "order": [
            {
                "product": "coffee",
                "quantity": 1,
                "size": "small"
            }
        ],
        "scheduled": "Wed, 22 Jun 2022 09:21:26 GMT",
        "status": "cancelled"
    }

    # NOTE(review): the request path embeds a ``Like`` matcher inside an
    # f-string and includes the protocol and host; both are the issues the
    # question is about.
    (pact
     .given('A scheduled order exists and it is not cancelled already')
     .upon_receiving('a request for cancellation')
     .with_request('get', f'http://localhost:3001/kitchen/schedule/{Like(12343)}/cancel')
     .will_respond_with(200, body=Like(expected)))

    with pact:
        payload = Order(UUID, datetime.now, {"product": "coffee", "size": "large", "quantity": 1}, "progress")
        print(payload)
        response = consumer.cancel(payload)
        assert response['status'] == "cancelled"
        pact.verify()
Also I originally had(adapted from the example in pact):
# return Order.cancel("http://{host}:{port}".format(host=PACT_MOCK_HOST, "port=PACT_MOCK_PORT))
but i'm not sure how that works
Thanks for helping me

There are a couple of issues here:
.with_request('get', f'http://localhost:3001/kitchen/schedule/{Like(12343)}/cancel')
The Like matcher is a function that returns an object. Adding this within a string is likely to cause issues when it is stringified
You don't need to put the protocol and host portion here - just the path e.g.:
.with_request(method='GET', path='/kitchen/schedule/bc72e917-4af1-4e39-b897-1eda6d006b18/cancel', headers={'Content-Type': 'application/json'} ...)
If you want to use a matcher on the path, it needs to be on the string as a whole e.g. Regex('/kitchen/schedule/([0-9]+)/cancel') (this is not a real regex, but hopefully you get the idea).
I can’t see in this code where it calls the actual mock service. I’ve removed the commented items for readability:
(pact
.given('A scheduled order exists and it is not cancelled already')
.upon_receiving('a request for cancellation')
.with_request(method='GET', path='/kitchen/schedule/bc72e917-4af1-4e39-b897-1eda6d006b18/cancel', headers={'Content-Type': 'application/json'},)
.will_respond_with(200, body=Like(expected)))
with pact:
# this needs to be sending a request to
# http://localhost:1234/kitchen/schedule/bc72e917-4af1-4e39-b897-1eda6d006b18/cancel
response = consumer.cancel()
pact.verify()
The definition of the function you are calling doesn't make any HTTP request to the pact mock service, it just returns a canned response.
#pytest.fixture
def consumer() -> Order.cancel:
# return Order.cancel("http://{host}:{port}".format(host=PACT_MOCK_HOST, "port=PACT_MOCK_PORT))
order = [OrderItem(**{"id":1, "product":"coffee", "size":"big", "quantity":2})]
payload = Order(id=UUID, created=datetime.now, items=order, status="progress")
return Order.cancel(payload)
For a Pact test to pass, you need to demonstrate your code actually calls the correct HTTP endpoints with the right data, and that your code can handle it.

Checking the payment status from an payment APi

I am trying to verify a user transaction on Paystack. After a user makes a payment, I want to append the reference to the API URL to check whether the payment was successful. If the payment was successful, I then save the model.
import requests
from django.conf import settings
class Paystack:
    """Small client for the Paystack transaction-verification endpoint."""

    # NOTE(review): the secret key is hard-coded in source; it should be loaded
    # from configuration (the file already imports django ``settings``).
    PAYSTACK_SECRET_KEY = "sk_test_3cd83d64a1de3a7334bdad47e3fdfa01bf16a059"
    base_url = "https://api.paystack.co"

    def verify_payment(self, reference, *args, **kwargs):
        """Look up ``reference`` on Paystack.

        Returns a 2-tuple: the ``status`` field of the JSON reply, followed by
        its ``data`` field when the HTTP status is 200, or its ``message``
        field otherwise.
        """
        path = f'/transaction/verify/{reference}'
        headers = {
            "Authorization": f"Bearer {self.PAYSTACK_SECRET_KEY}",
            "Content-Type": 'application/json',
        }
        response = requests.get(self.base_url + path, headers=headers)

        # The body is decoded once; both outcomes read the same payload.
        response_data = response.json()
        if response.status_code == 200:
            return response_data['status'], response_data['data']
        return response_data["status"], response_data["message"]
def process_payment(request, slug, amount, award, votes):
    """View from the question: record a transaction as successful or failed.

    NOTE(review): behaviour is kept exactly as asked. ``Paystack`` defines
    ``verify_payment``, not ``process_payment``, and ``self`` does not exist
    in this function — this is the error the question reports.
    """
    reason = request.GET.get('reason')
    transaction_id = request.GET.get('reference')
    amount = (str(int(amount) / 100))
    gateway = Paystack()
    status = gateway.process_payment(self.reference)

    if status != "success":
        context = {
            'error': reason
        }
        failed_record = FailedTransactionHistory(
            nominee_name=slug,
            transaction_id=transaction_id,
            amount=amount,
            award=award
        )
        failed_record.save()
        return render(request, 'payment_error.html', context=context)

    success_record = SuccessfulTransactionHistory(
        nominee_name=slug,
        transaction_id=transaction_id,
        amount=amount,
        award=award
    )
    success_record.save()
    Nomination.objects.filter(slug=slug).update(votes=F('votes') + votes)
    Award.objects.filter(slug=award).update(amount=F('amount') + amount)
    return redirect('vote:paymentsuccess', slug=slug)
This is the error I get:
AttributeError at /payment/Paul/000000000020/halotech-award-8/1/
'Paystack' object has no attribute 'process_payment'

The method names do not match: the Paystack class defines verify_payment, but the view calls process_payment.
def verify_payment(self, reference, *args, **kwargs):
status = paystack.process_payment(self.reference)

stripe does not save if transaction is successful in flask

I am trying to save some information in the database from the stripe_webhook view when a transaction is successful, but it does not work. Is it that data cannot be saved directly in a webhook? This is very frustrating. I looked online for sample code but could not find any that inserts into or updates the database after a successful transaction.
from site.models import Post, Chapter, Order
import stripe
from sqlalchemy import desc
@posts.route("/paynow")
@login_required
def paynow():
    """Render the payment page."""
    return render_template('paynow.html')
@posts.route('/stripe_pay')
@login_required
def stripe_pay():
    """Create a Stripe Checkout session and return its id with the public key."""
    amt = 10000
    stripe.api_key = current_app.config['STRIPE_SECRET_KEY']
    session = stripe.checkout.Session.create(
        payment_method_types=['card'],
        line_items=[{
            'price_data': {
                'currency': 'usd',
                'product_data': {
                    'name': 'T-shirt',
                },
                'unit_amount': amt,
            },
            'quantity': 1,
        }],
        mode='payment',
        # {CHECKOUT_SESSION_ID} is left as a literal placeholder in the URL.
        success_url=url_for('posts.payment_success', _external=True) + '?session_id={CHECKOUT_SESSION_ID}',
        cancel_url=url_for('posts.paynow', _external=True),
    )
    return {
        'checkout_session_id': session['id'],
        'checkout_public_key': current_app.config['STRIPE_PUBLIC_KEY']
    }
# No login_required here: the webhook is called by Stripe's servers, which
# carry no user session; the request is authenticated by its signature below.
@posts.route('/stripe_webhook', methods=['POST'])
def stripe_webhook():
    """Receive a Stripe webhook, verify its signature and record completed checkouts.

    Returns ``({}, 400)`` for an invalid payload or signature, aborts with 400
    for bodies over 1 MiB, and returns ``{}`` otherwise.
    """
    print('WEBHOOK CALLED')

    # content_length is None when the header is absent; treat that as 0.
    if (request.content_length or 0) > 1024 * 1024:
        print('REQUEST TOO BIG')
        abort(400)

    payload = request.get_data()
    sig_header = request.environ.get('HTTP_STRIPE_SIGNATURE')
    endpoint_secret = 'whsec_*************************************'

    try:
        event = stripe.Webhook.construct_event(
            payload, sig_header, endpoint_secret
        )
    except ValueError:
        # Invalid payload
        print('INVALID PAYLOAD')
        return {}, 400
    except stripe.error.SignatureVerificationError:
        # Invalid signature
        print('INVALID SIGNATURE')
        return {}, 400

    # Handle the checkout.session.completed event
    if event['type'] == 'checkout.session.completed':
        session = event['data']['object']
        print(session)
        line_items = stripe.checkout.Session.list_line_items(session['id'], limit=1)
        print(line_items['data'][0]['description'])
        # save to database if successful
        save_order = Order(trans_id="pppppppppp")
        db.session.add(save_order)
        db.session.commit()
    return {}
@posts.route('/payment_success')
@login_required
def payment_success():
    """Mark the current user's cart rows as paid and render the success page."""
    cart_items = Cart.query.filter_by(username=current_user.username).all()
    # .all() returns a list of rows, so the status must be set on each row;
    # assigning it on the list itself never reaches the database.
    for cart_item in cart_items:
        cart_item.status = "paid"
    db.session.commit()
    return render_template('payment_success.html')

According to Stripe customer support, I must use a live domain instead of localhost.

aiohttp - before request for each API call

When I was using Flask, every API call is authenticated before processed:
app = connexion.App(__name__, specification_dir='./swagger/', swagger_json=True, swagger_ui=True, server='tornado')
app.app.json_encoder = encoder.JSONEncoder
app.add_api('swagger.yaml', arguments={'title': 'ABCD API'})
# add CORS support
CORS(app.app)


@app.app.before_request
def before_request_func():
    """Authorize the request from its ``X-Token`` header before any handler runs.

    Returns a 401 JSON response when authorization fails; otherwise stores the
    user on ``g`` and returns ``None`` so request handling continues.
    """
    app_id = request.headers.get("X-AppId")  # read but not used here
    token = request.headers.get("X-Token")
    user, success = security.Security().authorize(token)
    if not success:
        status_code = 401
        response = {
            'code': status_code,
            'message': 'Unauthorized user'
        }
        return jsonify(response), status_code
    g.user = user
When I changed it to AioHttp, my authentication is not properly setup:
options = {'swagger_path': 'swagger/', "swagger_ui": True}
app = connexion.AioHttpApp(__name__, specification_dir='swagger/', options=options)
app.add_api('swagger.yaml', arguments={'title': ' ABCD API'})
# NOTE(review): kept as asked. This rebinds ``app`` to a new aiohttp
# application, discarding the connexion app configured above, and names
# ``auth_through_token`` before it is defined.
app = web.Application(middlewares=[auth_through_token])


async def auth_through_token(app: web.Application, handler: Any) -> Callable:
    """Middleware factory from the question: wraps ``handler`` with a token check."""
    @web.middleware
    async def middleware_handler(request: web.Request) -> web.Response:
        headers = request.headers
        x_auth_token = headers.get("X-Token")
        app_id = headers.get("X-AppId")
        user, success = security.Security().authorize(x_auth_token)
        if not success:
            return web.json_response(status=401, data={
                "error": {
                    "message": ("Not authorized. Reason: {}"
                                )
                }
            })
        response = await handler(request)
        return response
    return middleware_handler
My request is not getting redirected to the API method.
Could anyone please help me to set up, my before_request authentication for every API?
Thanks.

Firstly, you have to move middleware_handler out from auth_through_token.
Then,
Quote your code:
options = {'swagger_path': 'swagger/', "swagger_ui": True}
app = connexion.AioHttpApp(__name__, specification_dir='swagger/', options=options)
app.add_api('swagger.yaml', arguments={'title': ' ABCD API'})
app = web.Application(middlewares=[auth_through_token])
You have to remove the last line and change the first line to:
options = {'swagger_path': 'swagger/', "swagger_ui": True, 'middlewares': [middleware_handler]}
So finally the code should look like:
@web.middleware
async def middleware_handler(request: web.Request, handler: Any) -> web.Response:
    """Authorize every request from its ``X-Token`` header before calling ``handler``.

    Returns a 401 JSON response when authorization fails; otherwise stores the
    user on the request and returns the wrapped handler's response.
    """
    headers = request.headers
    x_auth_token = headers.get("X-Token")
    app_id = headers.get("X-AppId")  # read but not used here
    user, success = security.Security().authorize(x_auth_token)
    if not success:
        # NOTE(review): the "{}" placeholder is never filled in; authorize()
        # returns no reason text to format into it.
        return web.json_response(status=401, data={
            "error": {
                "message": ("Not authorized. Reason: {}"
                            )
            }
        })
    # Request-scoped storage, the counterpart of Flask's ``g.user``.
    request['user'] = user
    response = await handler(request)
    return response


# The middleware must be defined before it is referenced in ``options``.
options = {'swagger_path': 'swagger/', "swagger_ui": True, 'middlewares': [middleware_handler]}
app = connexion.AioHttpApp(__name__, specification_dir='swagger/', options=options)
app.add_api('swagger.yaml', arguments={'title': ' ABCD API'})

I need the request in my decorator but I can't figure out how to get it in there

My decorator function...
def validate_captcha(view):
    '''Decorator to validate a captcha based on settings'''
    # NOTE(review): kept as asked. ``request`` is not defined in this scope
    # (the subject of the question) and ``fail`` does not match ``failure``.
    def failure():
        return HttpResponse('You need to complete the captcha, please refresh and try again')

    if request.method != 'POST':
        return fail

    url = "https://www.google.com/recaptcha/api/siteverify"
    payload = {
        'secret': settings.GOOGLE_RECAPTCHA_SECRET_KEY,
        'response': request.POST.get(u'g-recaptcha-response', None),
        'remoteip': request.META.get("REMOTE_ADDR", None),
    }
    encoded = urllib.urlencode(payload)
    verify_request = urllib2.Request(url, encoded)
    reply = urllib2.urlopen(verify_request)
    verdict = json.loads(reply.read())
    # verdict["success"] will be True on a success
    return view if verdict["success"] else fail
and then my view...
#validate_captcha
def sendemail(request):
...
I could put the request in the decorator args, but then it is undefined when i put it in the view args. I tried calling it a few others ways without success, how would you put it in there?

You need to have a wrapper function:
def validate_captcha(view):
    """Decorator that runs ``view`` only after a successful reCAPTCHA check.

    The wrapped view receives the request as its first argument. A POST whose
    captcha verifies is passed on to ``view``; a failed captcha, or any
    non-POST request, gets the failure response instead.
    """
    from functools import wraps

    def failure():
        return HttpResponse('You need to complete the captcha, please refresh and try again')

    @wraps(view)
    def wrap(request, *args, **kwargs):
        if request.method != 'POST':
            return failure()

        url = "https://www.google.com/recaptcha/api/siteverify"
        values = {
            'secret': settings.GOOGLE_RECAPTCHA_SECRET_KEY,
            'response': request.POST.get(u'g-recaptcha-response', None),
            'remoteip': request.META.get("REMOTE_ADDR", None),
        }
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        result = json.loads(response.read())

        # result["success"] will be True on a success
        if result["success"]:
            # Call the view; returning the function object is not a response.
            return view(request, *args, **kwargs)
        return failure()

    return wrap
Make sure to study this awesome and quite detailed overview of decorators in Python (I personally think it is one of the best SO answers ever):
https://stackoverflow.com/a/1594484/771848

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapyd jobs not starting - python

I solved it by noticing an error in the scrapyd console log. I was missing the pywin32 library, though I don't understand why this wasn't in the requirements. A simple pip install pywin32 fixed it

Related

difficulty setting up consumer test pact python

Checking the payment status from an payment APi

stripe does not save if transaction is successful in flask

aiohttp - before request for each API call

I need the request in my decorator but I can't figure out how to get it in there

Categories

Resources