Dynamically setting scrapy request callback - python

I'm working with scrapy. I want to rotate proxies on a per-request basis, getting each proxy from an API I have that returns a single proxy. My plan is to make a request to the API, get a proxy, and then use it to set the proxy on the scraping request, based on:
http://stackoverflow.com/questions/39430454/making-request-to-api-from-within-scrapy-function
I have the following:
class ContactSpider(Spider):
    name = "contact"

    def parse(self, response):
        ....
        PR = Request(
            'my_api',
            headers=self.headers,
            meta={'newrequest': Request(url_to_scrape, headers=self.headers)},
            callback=self.parse_PR
        )
        yield PR

    def parse_PR(self, response):
        newrequest = response.meta['newrequest']
        proxy_data = response.body
        newrequest.meta['proxy'] = 'http://' + proxy_data
        newrequest.replace(url='http://ipinfo.io/ip')      #TESTING
        newrequest.replace(callback=self.form_output)      #TESTING
        yield newrequest

    def form_output(self, response):
        open_in_browser(response)
but I'm getting:
Traceback (most recent call last):
File "C:\twisted\internet\defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "C:\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "C:\scrapy\core\downloader\handlers\http11.py", line 60, in download_request
return agent.download_request(request)
File "C:\scrapy\core\downloader\handlers\http11.py", line 255, in download_request
agent = self._get_agent(request, timeout)
File "C:\scrapy\core\downloader\handlers\http11.py", line 235, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "C:\scrapy\core\downloader\webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "C:\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
host = b(parsed.hostname)
File "C:\scrapy\core\downloader\webclient.py", line 17, in <lambda>
b = lambda s: to_bytes(s, encoding='ascii')
File "C:\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got NoneType
what am I doing wrong?

The traceback suggests that Scrapy ended up with a request URL of None where a string was expected.
These two lines in your code:
newrequest.replace(url='http://ipinfo.io/ip')      #TESTING
newrequest.replace(callback=self.form_output)      #TESTING
do not work as expected, because the method Request.replace returns a new instance instead of modifying the original request in-place.
You would need something like this:
newrequest = newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest = newrequest.replace(callback= self.form_output) #TESTING
or simply:
newrequest = newrequest.replace(
    url='http://ipinfo.io/ip',
    callback=self.form_output
)
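
Putting the pieces together, a minimal sketch of the corrected callback might look like this (it assumes the proxy API returns a bare host:port string in the response body, so the body needs decoding and stripping; that part is not in the original):

    def parse_PR(self, response):
        newrequest = response.meta['newrequest']
        # response.body is bytes; decode and strip it so the proxy URL
        # ends up with a valid hostname (assumption about the API output)
        proxy_data = response.body.decode('utf-8').strip()
        newrequest.meta['proxy'] = 'http://' + proxy_data
        # Request.replace returns a new request, so capture the return value
        newrequest = newrequest.replace(
            url='http://ipinfo.io/ip',      #TESTING
            callback=self.form_output,      #TESTING
        )
        yield newrequest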

Related

Locust put request is randomly failing

My Locust file has PUT requests, but sometimes they pass and sometimes they fail. Can anyone explain why that's happening?
Here is my locust file
def _generate_put_data(self) -> str:
    if len(self.ids) > 0:
        teacher_name = self.generate_random_string()
        teacher_email = f"{self.generate_random_string()}#{self.generate_random_string()}.{self.generate_random_string()}"
        teacher_email = teacher_email.replace("#", "%40")
        teacher_id = str(random.choice(self.ids))
        request_string = f"{self.path_all}/{teacher_id}?teacherName={teacher_name}&teacherEmail={teacher_email}"
        return request_string

@task(2)
def put_request(self):
    self.client.put(url=self._generate_put_data())
...
The _generate_put_data method returns a string, which is used as the request path and query.
Here is the error:
[2023-01-26 11:39:12,357] pop-os/ERROR/locust.user.task: expected string or bytes-like object
Traceback (most recent call last):
File "/home/XXX/.local/lib/python3.10/site-packages/locust/user/task.py", line 347, in run
self.execute_next_task()
File "/home/XXX/.local/lib/python3.10/site-packages/locust/user/task.py", line 372, in execute_next_task
self.execute_task(self._task_queue.pop(0))
File "/home/XXX/.local/lib/python3.10/site-packages/locust/user/task.py", line 493, in execute_task
task(self.user)
File "/home/XXX/Desktop/my-projects/spring-boot-app/performans-testing/locust.py", line 44, in put_request
self.client.put(url=self._generate_put_data())
File "/home/XXX/.local/lib/python3.10/site-packages/requests/sessions.py", line 647, in put
return self.request("PUT", url, data=data, **kwargs)
File "/home/XXX/.local/lib/python3.10/site-packages/locust/clients.py", line 131, in request
url = self._build_url(url)
File "/home/XXX/.local/lib/python3.10/site-packages/locust/clients.py", line 81, in _build_url
if absolute_http_url_regexp.match(path):
TypeError: expected string or bytes-like object
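
The traceback points at Locust's _build_url receiving None: when self.ids is empty, _generate_put_data falls through its if branch and implicitly returns None, which would explain the intermittent failures. A hedged sketch of a guard inside the same user class (a fragment reusing the question's names):

@task(2)
def put_request(self):
    data = self._generate_put_data()
    if data is None:
        # self.ids was empty, so _generate_put_data fell through its `if`
        # and returned None; skip this iteration instead of passing None
        # to self.client.put, which raises the TypeError shown above.
        return
    self.client.put(url=data)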

Python: TypeError: inet_aton() argument 1 must be str, not None

I am working with a mobile game API and a Telegram bot. It worked when I put in a fixed clan tag, but now I want to let the user send a tag that gets appended to the link. The app should then search for the clan and get the right stats. Everything else is fine, but I now get this error and cannot find my mistake. I would be happy if you could help!
def main():
    last_update_id = None
    message = ""
    while True:
        updates = get_updates(last_update_id)
        if len(updates["result"]) > 0:
            last_update_id = get_last_update_id(updates) + 1
            message = get_last_update_Message(updates)
            clan_stats(updates, message)

def get_last_update_Message(updates):
    message = ""
    for update in updates["result"]:
        message = update["message"]
    return message["text"]

def clan_stats(updates, ID):
    # https://api.royaleapi.com/clan/1RLU78YU
    Link = '"https://api.royaleapi.com/clan/' + ID + '"'
    r = requests.get(Link, headers={"Accept": "application/json",
                                    "authorization": "Bearer TOKENHERE"})
    clan = r.json()
Full Traceback:
Traceback (most recent call last):
File "/home/Lee63225/clashroyaleclanbot.py", line 188, in <module>
main()
File "/home/Lee63225/clashroyaleclanbot.py", line 184, in main
clan_stats(updates, message)
File "/home/Lee63225/clashroyaleclanbot.py", line 80, in clan_stats
"authorization":"Bearer TOKENHERE"})
File "/usr/lib/python3.7/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 503, in request
prep.url, proxies, stream, verify, cert
File "/usr/lib/python3.7/site-packages/requests/sessions.py", line 676, in merge_environment_settings
env_proxies = get_environ_proxies(url, no_proxy=no_proxy)
File "/usr/lib/python3.7/site-packages/requests/utils.py", line 760, in get_environ_proxies
if should_bypass_proxies(url, no_proxy=no_proxy):
File "/usr/lib/python3.7/site-packages/requests/utils.py", line 716, in should_bypass_proxies
if is_ipv4_address(parsed.hostname):
File "/usr/lib/python3.7/site-packages/requests/utils.py", line 640, in is_ipv4_address
socket.inet_aton(string_ip)
TypeError: inet_aton() argument 1 must be str, not None
Thank you!
I think it should rather be
Link = 'https://api.royaleapi.com/clan/' + ID
There are stray surrounding " characters in your attempt.
Also, now that I look again, the function is called as clan_stats(updates, message). That message should be a clan tag; make sure it is, because right now it comes from get_last_update_Message() and looks very suspicious.
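
A minimal corrected sketch of clan_stats along those lines (the .strip() on the tag is an added assumption, not from the original code):

import requests

def clan_stats(updates, ID):
    # Build the URL without the extra quote characters; stripping whitespace
    # from the user-supplied tag (an assumption) helps the hostname parse cleanly.
    link = 'https://api.royaleapi.com/clan/' + ID.strip()
    r = requests.get(link, headers={"Accept": "application/json",
                                    "authorization": "Bearer TOKENHERE"})
    return r.json()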

"'str' object has no attribute 'get'" when using Google Cloud Storage with ScrapingHub

I'm trying to get Google Cloud Storage working with a Scrapy Cloud + Crawlera project so that I can save text files I'm trying to download. I'm encountering an error when I run my script that seems to have to do with my Google permissions not working properly.
Error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/media.py", line 68, in from_crawler
pipe = cls.from_settings(crawler.settings)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 325, in from_settings
return cls(store_uri, settings=settings)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 289, in __init__
self.store = self._get_store(store_uri)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 333, in _get_store
return store_cls(uri)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 217, in __init__
client = storage.Client(project=self.GCS_PROJECT_ID)
File "/app/python/lib/python3.7/site-packages/google/cloud/storage/client.py", line 82, in __init__
project=project, credentials=credentials, _http=_http
File "/app/python/lib/python3.7/site-packages/google/cloud/client.py", line 228, in __init__
Client.__init__(self, credentials=credentials, _http=_http)
File "/app/python/lib/python3.7/site-packages/google/cloud/client.py", line 133, in __init__
credentials, _ = google.auth.default()
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 305, in default
credentials, project_id = checker()
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 165, in _get_explicit_environ_credentials
os.environ[environment_vars.CREDENTIALS])
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 102, in _load_credentials_from_file
credential_type = info.get('type')
AttributeError: 'str' object has no attribute 'get'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/local/lib/python3.7/site-packages/scrapy/core/engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/core/scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/utils/misc.py", line 140, in create_instance
return objcls.from_crawler(crawler, *args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/media.py", line 70, in from_crawler
pipe = cls()
TypeError: __init__() missing 1 required positional argument: 'store_uri'
__init__.py where I create the credentials file:
# Code from https://medium.com/#rutger_93697/i-thought-this-solution-was-somewhat-complex-3e8bc91f83f8
import os
import json
import pkgutil
import logging

path = "{}/google-cloud-storage-credentials.json".format(os.getcwd())
credentials_content = '<escaped JSON data>'
with open(path, "w") as text_file:
    text_file.write(json.dumps(credentials_content))
logging.warning("Path to credentials: %s" % path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path
settings.py:
BOT_NAME = 'get_case_urls'
SPIDER_MODULES = ['get_case_urls.spiders']
NEWSPIDER_MODULE = 'get_case_urls.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Crawlera
DOWNLOADER_MIDDLEWARES = {'scrapy_crawlera.CrawleraMiddleware': 300}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<crawlera-api-key>'
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 600
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 500
}
FILES_STORE = 'gs://<name-of-my-gcs-project>'
IMAGES_STORE = 'gs://<name-of-my-gcs-project>'
GCS_PROJECT_ID = "<id-of-my-gcs-project>"
After looking at the code for _load_credentials_from_file, it seems to me that I had not saved the JSON to the text file correctly: in __init__.py, rather than having text_file.write(json.dumps(credentials_content)), I should have had text_file.write(credentials_content) or text_file.write(json.dumps(json.loads(credentials_content))).
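
In other words, a minimal corrected sketch of __init__.py (with '<escaped JSON data>' still standing in for the real service-account JSON string):

import os
import json
import logging

path = "{}/google-cloud-storage-credentials.json".format(os.getcwd())
credentials_content = '<escaped JSON data>'

with open(path, "w") as text_file:
    # Write the JSON string as-is; calling json.dumps() on a str wraps it in
    # quotes, producing a file google.auth parses as a string instead of a dict.
    text_file.write(credentials_content)

logging.warning("Path to credentials: %s" % path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path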

Bulk index to elasticsearch using python

I have the following code, which indexes data to Elasticsearch using Python:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import requests
from requests.auth import AuthBase

requests.packages.urllib3.disable_warnings()

class TokenAuth(AuthBase):
    def __init__(self, token):
        self.token = token

    def __call__(self, r):
        r.headers['Authorization :Bearer'] = f'{self.token}'
        return r

es = Elasticsearch('https://localhost:9200/user/type', ca_certs=False, verify_certs=False, auth=TokenAuth(''))
# requests.get('https://httpbin.org/get', auth=TokenAuth('12345abcde-token'))

res = helpers.bulk(es, "ldif2.json", chunk_size=1, request_timeout=200)
It uses token-based authentication, but when I run this program I get the error message below. How do I solve this?
Traceback (most recent call last):
File "bulk_index.py", line 20, in <module>
res = helpers.bulk(es, "ldif2.json", chunk_size=1, request_timeout=200)
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\helpers\actions.py", line 300, in bulk
for ok, item in streaming_bulk(client, actions, *args, **kwargs):
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\helpers\actions.py", line 230, in streaming_bulk
**kwargs
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\helpers\actions.py", line 116, in _process_bulk_chunk
raise e
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\helpers\actions.py", line 112, in _process_bulk_chunk
resp = client.bulk("\n".join(bulk_actions) + "\n", *args, **kwargs)
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\client\utils.py", line 84, in _wrapped
return func(*args, params=params, **kwargs)
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\client\__init__.py", line 1498, in bulk
headers={"content-type": "application/x-ndjson"},
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\transport.py", line 353, in perform_request
timeout=timeout,
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\connection\http_urllib3.py", line 239, in perform_request
self._raise_error(response.status, raw_data)
File "C:\Users\mkumaru\AppData\Local\Programs\Python\Python37\lib\site-packages\elasticsearch\connection\base.py", line 168, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.AuthenticationException: AuthenticationException(401, 'Access denied')
I think es should be created like this:
es = Elasticsearch("http://127.0.0.1:9200", http_auth=('user', 'passwd'))
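
For reference, a minimal sketch of how the bulk call could be wired up with basic auth, assuming ldif2.json contains one JSON document per line (the index name my-index and the credentials are placeholders):

import json
from elasticsearch import Elasticsearch, helpers

# Basic auth as suggested above; certificate checks left off to match the question.
es = Elasticsearch("https://localhost:9200", http_auth=("user", "passwd"),
                   verify_certs=False)

def actions_from_file(path, index):
    # helpers.bulk expects an iterable of action dicts, not a file name.
    with open(path) as f:
        for line in f:
            if line.strip():
                yield {"_index": index, "_source": json.loads(line)}

res = helpers.bulk(es, actions_from_file("ldif2.json", "my-index"),
                   chunk_size=1, request_timeout=200)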

python-instagram OAuthPermissionsException

So I'm trying to get Instagram photos that fit certain parameters, and I'm getting the following stack trace:
Traceback (most recent call last):
File "instagram_find_shows.py", line 83, in <module>
if __name__ == "__main__": main()
File "instagram_find_shows.py", line 48, in main
get_instagram_posts(show_name, show_time, coordinates)
File "instagram_find_shows.py", line 73, in get_instagram_posts
str(coordinates[1]), min_time, max_time)
File "C:\Users\User Name\Anaconda3\lib\site-packages\instagram\bind.py", line 197, in _call
return method.execute()
File "C:\Users\User Name\Anaconda3\lib\site-packages\instagram\bind.py", line 189, in execute
content, next = self._do_api_request(url, method, body, headers)
File "C:\Users\User Name\Anaconda3\lib\site-packages\instagram\bind.py", line 163, in _do_api_request
raise InstagramAPIError(status_code, content_obj['meta']['error_type'], content_obj['meta']['error_message'])
instagram.bind.InstagramAPIError: (400) OAuthPermissionsException-This request requires scope=public_content, but this access token is not authorized with this scope. The user must re-authorize your application with scope=public_content to be granted this permissions.
The code is as follows:
def get_instagram_posts(name, time, coordinates):
    max_time_dt = time + timedelta(hours=3)
    min_time_dt = time - timedelta(hours=1)
    max_time = str(calendar.timegm(max_time_dt.timetuple()))
    min_time = str(calendar.timegm(min_time_dt.timetuple()))
    dist_rad_str = str(insta_dist_radius_m)
    count_str = str(insta_count)
    api = InstagramAPI(access_token=insta_access_token,
                       client_secret=insta_client_secret)
    r = api.media_search(name, count_str, str(coordinates[0]),
                         str(coordinates[1]), min_time, max_time)
    photos = []
    for media in r:
        photos.append('<img src="%s"/>' % media.images['thumbnail'].url)
    print(photos[0])
I can't figure out what to do... Literally I'm just trying to do a simple test, not trying to cripple their API. Is there any way to do this within Instagram's parameters? Thanks so much!
Fixed by going to the following URL in the browser:
https://www.instagram.com/oauth/authorize?client_id=[CLIENT_ID]&redirect_uri=[REDIRECT_URI]&response_type=code&scope=basic+public_content+follower_list+comments+relationships+likes
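
For completeness, the code that Instagram sends back to the redirect_uri after that authorization still has to be exchanged for a new access token. A rough sketch, assuming the python-instagram client exposes exchange_code_for_access_token the way its sample app does (treat the exact call as an assumption); CLIENT_ID, CLIENT_SECRET, REDIRECT_URI and code are placeholders:

from instagram.client import InstagramAPI

# Unauthenticated client used only for the token exchange (placeholder values).
unauthenticated_api = InstagramAPI(client_id=CLIENT_ID,
                                   client_secret=CLIENT_SECRET,
                                   redirect_uri=REDIRECT_URI)
access_token, user_info = unauthenticated_api.exchange_code_for_access_token(code)

# Rebuild the client from the question with the newly scoped token.
api = InstagramAPI(access_token=access_token, client_secret=CLIENT_SECRET)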
