Global name 'Request' is not defined.
#!/usr/bin/env python
import BeautifulSoup
import requests

link = ''

# sitekey retrieval
def get_sitekey():
    captcha_page = Request(link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36'
                                                        '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36'})
    product_page = urlopen(captcha_page)
    soup = BeautifulSoup(product_page, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)

if __name__ == '__main__':
    get_sitekey()
You need to access the Request class through the module it lives in. Since the code hands the request to urlopen(), both names should come from urllib.request:
#!/usr/bin/env python
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

link = ''

# sitekey retrieval
def get_sitekey():
    captcha_page = Request(link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 '
                                                        '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36'})
    product_page = urlopen(captcha_page)
    soup = BeautifulSoup(product_page, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)

if __name__ == '__main__':
    get_sitekey()
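Alternatively, since the question already imports requests, you can let it fetch the page and drop Request/urlopen entirely. A minimal sketch (link is still the placeholder URL from above):

import requests
from bs4 import BeautifulSoup

def get_sitekey(link):
    # requests builds and sends the request in one call; no Request object needed
    resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']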
This is sample code. You need to get the result URLs from the response text:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}

page = requests.get('https://duckduckgo.com/html/?q=test', headers=headers).text
soup = BeautifulSoup(page, 'html.parser').find_all("a", class_="result__url", href=True)

for link in soup:
    print(link['href'])
You can use f-strings
search_text = "foo"
page = requests.get(f'https://duckduckgo.com/html/?q={search_text}', headers=headers).text
import os, requests, argparse, ScrapeSearchEngine, time, threading
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dorks", help="Your dorks list", required=True)
args = parser.parse_args()

with open(args.dorks, 'r') as f:
    dorks = [line.strip('\n') for line in f]

scraped = 0
for dork in dorks:
    if os.name == "nt":
        os.system('title SQLI Crawler ^| Dork: ' + str(dork) + ' ^| Scraped Links: ' + str(scraped))
    search = dork
    page = requests.get(f'https://duckduckgo.com/html/?q={search}', headers=headers).text
    soup = BeautifulSoup(page, 'html.parser').find_all("a", class_="result__url", href=True)
    for link in soup:
        print(link['href'])
I've added the new changes, but I can't figure out how to set the number of result pages per search or how to save the links to links.txt.
import os, requests, argparse, colorama, ScrapeSearchEngine, time, threading
from bs4 import BeautifulSoup
from colorama import Fore

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dorks", help="Your dorks list", required=True)
args = parser.parse_args()

with open(args.dorks, 'r') as f:
    dorks = [line.strip('\n') for line in f]

scraped = 0
for dork in dorks:
    if os.name == "nt":
        os.system('title SQLI Crawler ^| Dork: ' + str(dork) + ' ^| Scraped Links: ' + str(scraped))
    search = dork
    page = requests.get(f'https://duckduckgo.com/html/?q={search}', headers=headers).text
    soup = BeautifulSoup(page, 'html.parser').find_all("a", class_="result__url", href=True)
    for link in ScrapedLinks:
        scraped += 1
        open('links.txt', 'a+').write(link + "\n")
        print(f"[{Fore.CYAN}{time.strftime('%H:%M:%S')}{Fore.RESET}] [{Fore.YELLOW}INFO{Fore.RESET}] " + link)
        if args.scan == 'true':
            threading.Thread(target=scanner, args=(link,)).start()
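One way to handle both points (a fixed number of result pages and saving to links.txt) is sketched below; it is not taken from the script above. The next-page form selector (div.nav-link form) is an assumption about DuckDuckGo's current HTML markup, so verify it against the live results page before relying on it.

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0"}

def scrape_dork(dork, max_pages=3):
    """Scrape up to max_pages of DuckDuckGo HTML results for one dork,
    appending each result URL to links.txt. Returns the number of links saved."""
    scraped = 0
    params = {"q": dork}
    for _ in range(max_pages):
        page = requests.post("https://duckduckgo.com/html/", data=params, headers=headers).text
        soup = BeautifulSoup(page, "html.parser")
        with open("links.txt", "a+") as out:
            for link in soup.find_all("a", class_="result__url", href=True):
                out.write(link["href"] + "\n")
                scraped += 1
        # Pagination on the HTML endpoint is a form of hidden inputs; re-submit it for the next page.
        next_form = soup.select_one("div.nav-link form")   # assumed selector
        if next_form is None:
            break
        params = {i["name"]: i.get("value", "") for i in next_form.find_all("input", attrs={"name": True})}
    return scraped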
I'm working on a web scraper that, as a first step, collects all the pages. I have tested this code on another site, but here I'm having a problem getting the next page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

pages = []
with requests.Session() as session:
    page_number = 1
    url = "https://www.last.fm/user/" + userName + "/library/artists?page="
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break
        url = urljoin(url, next_link["href"])
        page_number += 1
As you can see, the href on this site is only the relative "?page=2", which doesn't let me fetch the next page's content directly (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You call urljoin(url, next_link["href"]), but next_link does not have an href attribute because you selected the <li>, not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
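Either way, urljoin() resolves the relative "?page=2" href against the current page URL, replacing only the query string:

from urllib.parse import urljoin

print(urljoin('https://www.last.fm/user/brendanm1975/library/artists?page=1', '?page=2'))
# -> https://www.last.fm/user/brendanm1975/library/artists?page=2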
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

pages = []
with requests.Session() as session:
    url = "https://www.last.fm/user/" + userName + "/library/artists?page=1"
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break
        url = urljoin(url, next_link.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]
I am scraping data from the Rakuten Japanese e-commerce website using requests-html and Beautiful Soup.
The problem: when I send the request from my local PC (127.0.0.1) it works fine, but when I send it from my EC2 server I only get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page at all. On the other hand, when I fetch the same URL from the server with wget, I get the full page. It is only my script that fails.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows
this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea on how this could be done, I would be grateful to hear.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double check the headers (I've modified yours a bit - see code below)
check your query input; maybe some of them are malformed?
don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
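If you need to run many queries from the same EC2 instance, pacing the requests helps you stay under whatever rate limit Akamai applies. A minimal sketch using the functions above (queries_to_run and the 5-second delay are illustrative, not part of the original answer):

import time

queries_to_run = ['スマートキー 電波遮断ケース', 'キーケース']  # hypothetical list of search terms

for q in queries_to_run:
    print(main_rakuten_product_search(q))
    time.sleep(5)  # wait a few seconds between searches so one IP doesn't hammer the site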
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()
    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"
    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
File "scra.py", line 17, in check
doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting page to pagedata = page.json() but it only made it worse.
The AttributeError happens because the later line html = req.get(i) rebinds the name html from the lxml module to a plain string, so html.fromstring no longer exists by the time check() runs. Try using this instead of html.fromstring:
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
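With BeautifulSoup you can also read the availability text directly, without XPath. A sketch of check() rewritten that way (it assumes the page still contains the same div with id="availability" that the original XPath targeted):

import requests
from bs4 import BeautifulSoup

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = BeautifulSoup(page.content, 'html.parser')
    availability = doc.find('div', id='availability')  # same element the XPath queried
    return availability.get_text(strip=True) if availability else None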
I have a script that gets the lyrics of a song from MetroLyrics using requests and bs4.
The problem is that when I print them, part of the lyrics shows up like this:
Rabbi, Papa, Allah, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, Vino, Kashèr, ḤalÄl, Yom Kippur, Quaresima, Ramadan
when it should look like this
Rabbi, Papa, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, vino, kashèr, ḥalāl, Yom Kippur, Quaresima, Ramadan
Here is the code I use:
import requests
from bs4 import BeautifulSoup
import os

try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

def get_lyrics(song_name):
    song_name += ' metrolyrics'
    name = quote_plus(song_name)
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                         '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    url = 'http://www.google.com/search?q=' + name
    result = requests.get(url, headers=hdr).text
    link_start = result.find('http://www.metrolyrics.com')
    if link_start == -1:
        return "Lyrics not found on Metrolyrics"
    link_end = result.find('html', link_start + 1)
    link = result[link_start:link_end + 4]
    lyrics_html = requests.get(link, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel '
                      'Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    }).text
    soup = BeautifulSoup(lyrics_html, "lxml")
    raw_lyrics = soup.findAll('p', attrs={'class': 'verse'})
    paras = []
    try:
        final_lyrics = unicode.join(u'\n', map(unicode, raw_lyrics))
    except NameError:
        final_lyrics = str.join(u'\n', map(str, raw_lyrics))
    final_lyrics = final_lyrics.replace('<p class="verse">', '\n')
    final_lyrics = final_lyrics.replace('<br/>', ' ')
    final_lyrics = final_lyrics.replace('</p>', ' ')
    return final_lyrics
I have tried .encode('utf-8') and .encode('unicode-escape') and then converting back again, but with no luck.
I have another script that uses the Musixmatch API, and there the Unicode shows up correctly.
I made a small change in the get_lyrics function:
return final_lyrics.encode('latin1').decode('utf-8')
and got desired output:
# python2
print get_lyrics('kashèr')
...
Rabbi, Papa, Allah, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, Vino, Kashèr, Ḥalāl, Yom Kippur, Quaresima, Ramadan
...
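The mojibake appears because the lyrics page is UTF-8, but when a response doesn't declare a charset, requests typically falls back to ISO-8859-1 for .text, so the bytes get decoded with the wrong codec; encode('latin1').decode('utf-8') reverses that. An alternative sketch (same assumption that the page really is UTF-8) is to set the encoding before reading .text inside get_lyrics:

resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})  # same request as in get_lyrics
resp.encoding = 'utf-8'   # tell requests the real encoding up front
lyrics_html = resp.text   # now decoded correctly before BeautifulSoup parses it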