Need help setting up parsing from a text document. Python3 - python

This is sample code. The search query in the request URL needs to come from a txt file.
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15;rv:84.0) Gecko/20100101 Firefox/84.0",
}

# Download the HTML results page for the fixed query "test".
response = requests.get('https://duckduckgo.com/html/?q=test', headers=headers)
page = response.text

# Collect every <a class="result__url"> that carries an href and print the targets.
parsed = BeautifulSoup(page, 'html.parser')
soup = parsed.find_all("a", class_="result__url", href=True)
for link in soup:
    print(link['href'])

You can use f-strings
from urllib.parse import quote_plus

search_text = "foo"
# The f-string drops the variable straight into the URL. The query is
# percent-encoded first so that characters such as spaces, "&", "#" or "+"
# stay inside the q= parameter instead of changing the URL's structure.
page = requests.get(
    f'https://duckduckgo.com/html/?q={quote_plus(search_text)}',
    headers=headers,
).text

import requests, argparse, ScrapeSearchEngine, time, threading
from bs4 import BeautifulSoup
import ctypes
import os  # os.name is used below; the snippet's import line does not include os
from urllib.parse import quote_plus

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dorks", help="Your dorks list", required=True)
args = parser.parse_args()

# One search query ("dork") per line: drop the line ending and skip blank lines.
with open(args.dorks, 'r') as f:
    dorks = [line.rstrip('\r\n') for line in f if line.strip()]

scraped = 0  # number of result links printed so far
for dork in dorks:
    if os.name == "nt":
        # Set the console title through the Win32 API rather than
        # os.system('title ...'): the dork text comes from a file and must
        # never be pasted into a shell command line.
        ctypes.windll.kernel32.SetConsoleTitleW(
            f"SQLI Crawler | Dork: {dork} | Scraped Links: {scraped}"
        )

    # Percent-encode the dork so spaces, "&", "?" etc. remain part of q=.
    page = requests.get(
        f'https://duckduckgo.com/html/?q={quote_plus(dork)}',
        headers=headers,
        timeout=10,
    ).text

    result_links = BeautifulSoup(page, 'html.parser').find_all("a", class_="result__url", href=True)
    for link in result_links:
        scraped += 1
        print(link['href'])

Added new changes. I can't work out how to set the number of result pages to search, or how to save the links to links.txt.
import os, requests, argparse, colorama, ScrapeSearchEngine, time, threading
from bs4 import BeautifulSoup
import ctypes
from urllib.parse import quote_plus

from colorama import Fore  # the import line above binds only the bare `colorama` module

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dorks", help="Your dorks list", required=True)
# The loop below reads args.scan, so the option has to exist; scanning is off by default.
parser.add_argument("--scan", help="Pass 'true' to run scanner() on every scraped link", default="false")
args = parser.parse_args()

# One search query ("dork") per line: drop the line ending and skip blank lines.
with open(args.dorks, 'r') as f:
    dorks = [line.rstrip('\r\n') for line in f if line.strip()]

scraped = 0  # number of links written so far
# Open the output file once and let `with` close it, instead of leaking a new
# handle for every link.
with open('links.txt', 'a') as links_file:
    for dork in dorks:
        if os.name == "nt":
            # Set the console title through the Win32 API rather than
            # os.system('title ...'): the dork text comes from a file and
            # must never be pasted into a shell command line.
            ctypes.windll.kernel32.SetConsoleTitleW(
                f"SQLI Crawler | Dork: {dork} | Scraped Links: {scraped}"
            )

        # Percent-encode the dork so spaces, "&", "?" etc. remain part of q=.
        page = requests.get(
            f'https://duckduckgo.com/html/?q={quote_plus(dork)}',
            headers=headers,
            timeout=10,
        ).text

        anchors = BeautifulSoup(page, 'html.parser').find_all("a", class_="result__url", href=True)
        for anchor in anchors:
            link = anchor['href']
            scraped += 1
            links_file.write(link + "\n")
            print(f"[{Fore.CYAN}{time.strftime('%H:%M:%S')}{Fore.RESET}] [{Fore.YELLOW}INFO{Fore.RESET}] " + link)
            if args.scan == 'true':
                # NOTE(review): scanner is not defined anywhere in this
                # snippet; it has to be provided before --scan true is used.
                threading.Thread(target=scanner, args=(link,)).start()

Related

How to get the value of a "hidden" href?

I'm working on web scraping and, as a first step, want to collect all the page URLs. I tested code that I wrote for another site, but here I am having a problem getting the next-page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []  # every URL that was requested, in order

with requests.Session() as session:
    page_number = 1
    url = f"https://www.last.fm/user/{userName}/library/artists?page="
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)

        # Stop as soon as the page has no <li class="pagination-next">.
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break

        # next_link is the <li> element itself; the href is read from it
        # directly, not from the <a> nested inside it.
        url = urljoin(url, next_link["href"])
        page_number += 1
As you can see, the href of this site presents the link as "?page=2", which does not allow me to get its content (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You call urljoin(url, next_link["href"]), but next_link does not have an href attribute, because you are selecting the <li> and not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []  # every URL that was requested, in order

with requests.Session() as session:
    url = f"https://www.last.fm/user/{userName}/library/artists?page=1"
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)

        # Stop as soon as the page has no <li class="pagination-next">.
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break

        # The href lives on the <a> inside the <li>; urljoin resolves the
        # relative "?page=N" value against the current URL.
        url = urljoin(url, next_link.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]

How to grab some part of the link inside the td tag in python

I'm trying to grab the link inside a td. My code does not display the link or produce the desired output. What I need to change.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

# Fetch the transaction list and take the rows of the first table on the page.
reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)

# The first row is skipped; cell 1 is read as the hash and cell 8 as the destination.
for row in rowsblockdetails[1:]:
    cells = row.find_all('td')
    txnhashdetails = cells[1].text.strip()
    destination = cells[8].text.strip()
    if destination != "CoinOne: CONE Token":
        continue

    # This lookup runs on the whole document (soupblockdetails), not on the
    # current row, and takes the element's visible text.
    urldest = soupblockdetails.find('a', attrs={'class': 'hash-tag text-truncate'}).text
    print (" {:>1} {:<5}".format(txnhashdetails, destination))
    print (urldest)
Current Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token Validator: Stake2me
Needed Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token 0x9628735017f1a985ebaac0b203efb9e8d3ed0fef
It would be better to search for <a> element in currently selected <td> but not in whole document so I changed code to td = row.find_all('td')[8] and later to td.find('a', ...).
Here is a working code:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests
def _strip_prefix(text, prefix):
    """Return text without a leading prefix; text is returned unchanged if it does not start with prefix."""
    return text[len(prefix):] if text.startswith(prefix) else text


headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

# Fetch the transaction list and take the rows of the first table on the page.
reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)

# The first row is skipped; cell 1 is read as the hash and cell 8 as the destination.
for row in rowsblockdetails[1:]:
    cells = row.find_all('td')
    txnhashdetails = cells[1].text.strip()
    td = cells[8]
    destination = td.text.strip()
    if destination == "CoinOne: CONE Token":
        # Search inside the current cell only, so the link belongs to this row.
        href = td.find('a', attrs={'class': 'hash-tag text-truncate'})["href"]
        # str.lstrip("/address/") would treat its argument as a set of
        # characters and could also eat leading a/d/r/e/s characters of the
        # value itself; remove the exact prefix instead.
        urldest = _strip_prefix(href, "/address/")
        print (" {:>1} {:<5}".format(txnhashdetails, destination))
        print (urldest)
Hope, it will work. try this:
# Alternative lookup: find a <span> carrying the same two classes and read
# the href of the <a> nested inside it.
# NOTE(review): this queries soupblockdetails (the whole document), not the
# current row's <td> - confirm it yields the link for the matching row.
t_link = soupblockdetails.find('span', attrs={'class': 'hash-tag text-truncate'})
urldest = t_link.a['href']

Loop page with beautifulsoup

I would like to scrape the player URLs from all pages of this website: https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780
but I can scrape only the first one, why?
I wrote a for loop with range():
import pandas as pd
import requests
from bs4 import BeautifulSoup
list_url = []  # filled by get_player_urls()


def get_player_urls(page):
    """Request the search-result page and append each matching profile link to list_url.

    Every collected URL is also printed. Returns None.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    # Plain string literal: the "{page}" text is requested as written and the
    # page argument is not substituted into it.
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'
    response = requests.get(link, headers=request_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a', class_='spielprofil_tooltip'):
        player_url = 'https://www.transfermarkt.it' + anchor.get('href')
        print(player_url)
        list_url.append(player_url)
    return


# Call the function for pages 1 through 10, then write the URLs to a
# single-column CSV without index or header.
for page in range(1, 11):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)
You're not actually inserting the page number into the URL. Also, there is no need to put return at the end of your function, since you aren't returning anything:
import pandas as pd
import requests
from bs4 import BeautifulSoup
list_url = []  # filled by get_player_urls()


def get_player_urls(page):
    """Request result page number `page` and append each matching profile link to list_url.

    Every collected URL is also printed.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'.format(page=page) #<-- Add this
    response = requests.get(link, headers=request_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a', class_='spielprofil_tooltip'):
        player_url = 'https://www.transfermarkt.it' + anchor.get('href')
        print(player_url)
        list_url.append(player_url)


# Collect pages 1 through 10, then write the URLs to a single-column CSV
# without index or header.
for page in range(1, 11):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)

BeautifulSoup not returning full html of the page

I want to scrape a few fields (title, URL, ASIN) from Amazon pages, and I ran into a problem: the script parses only 15 products while the page shows 50. I printed the whole HTML to the console and saw that it ends after 15 products, without any errors from the script.
Here is the part of my script
# Build the search URL: spaces in the keyword become "+".
keyword = "men jeans".replace(' ', '+')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
url = f"https://www.amazon.com/s/field-keywords={keyword}"

# Fetch the page through a session, wait three seconds, then parse and dump
# whatever HTML the response body contained.
request = requests.session()
req = request.get(url, headers=headers)
sleep(3)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup)
It's because some of the items are generated dynamically. There might be a better solution than using Selenium; however, as a workaround, you can try the approach below.
from selenium import webdriver
from bs4 import BeautifulSoup
def fetch_item(driver,keyword):
    """Load the search page for keyword in driver and print the <h2> text of each result.

    Reads the module-level `url` template; spaces in keyword become "+".
    Elements whose id starts with "result_" are treated as results.
    """
    search_url = url.format(keyword.replace(" ", "+"))
    driver.get(search_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for item in soup.select("[id^='result_']"):
        heading = item.select_one("h2")
        # A result without an <h2> is printed as an empty line.
        name = "" if heading is None else heading.text
        print(name)


if __name__ == '__main__':
    url = "https://www.amazon.com/s/field-keywords={}"
    driver = webdriver.Chrome()
    # Always shut the browser down, even when fetching fails.
    try:
        fetch_item(driver,"men jeans")
    finally:
        driver.quit()
Upon running the above script you should get 56 names or something as result.
import requests
from bs4 import BeautifulSoup
# Loop-invariant setup: the keyword, headers and HTTP session are the same
# for every page, so they are created once.
keyword = "red car".replace(' ', '+')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
session = requests.session()

for page in range(1, 21):
    url = "https://www.amazon.com/s/field-keywords=" + keyword + "?page=" + str(page)
    req = session.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    results = soup.findAll("li", {"class": "s-result-item"})
    for item in results:
        try:
            print(item.find("h2", {"class": "s-access-title"}).text.replace('[SPONSORED]', ''))
            print(item.find("span", {"class": "sx-price-large"}).text.replace("\n", ' '))
            print('*' * 20)
        except AttributeError:
            # A result without a title or price element makes the lookup
            # yield None; skip the rest of that item. Only this error is
            # swallowed, so Ctrl+C and real failures still surface.
            continue
Amazon's search results go up to page 20 at most; the code above crawls those pages.

'Request' is not defined-python 3

Global name 'Request' is not defined.
#!/usr/bin/env python
import BeautifulSoup
import requests
link = ''


# sitekey retrieval
def get_sitekey():
    """Fetch the page at `link` and print the data-sitekey attribute of its div.g-recaptcha."""
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36'
                  '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36')
    # Request and urlopen are used as bare names here; nothing in this
    # snippet imports them.
    captcha_page = Request(link, headers={'User-Agent': user_agent})
    product_page = urlopen(captcha_page)
    soup = BeautifulSoup(product_page, 'html.parser')
    captcha_div = soup.find('div', attrs={'class': 'g-recaptcha'})
    sitekey = captcha_div['data-sitekey']
    print(sitekey)


if __name__ == '__main__':
    get_sitekey()
You need to access the Request object from within the request module.
#!/usr/bin/env python
import BeautifulSoup
import requests
link = ''


# sitekey retrieval
def get_sitekey():
    """Fetch the page at `link` and print the data-sitekey attribute of its div.g-recaptcha."""
    # Request and urlopen both live in the standard library's urllib.request
    # module. requests.Request is a different class: its first positional
    # argument is the HTTP method, and its instances cannot be given to
    # urlopen().
    from urllib.request import Request, urlopen

    # The file-level `import BeautifulSoup` binds a module, which is not
    # callable; the parser class is provided by the bs4 package.
    from bs4 import BeautifulSoup

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36'
                  '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36')
    captcha_page = Request(link, headers={'User-Agent': user_agent})
    # Close the HTTP response once the document has been parsed.
    with urlopen(captcha_page) as product_page:
        soup = BeautifulSoup(product_page, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)


if __name__ == '__main__':
    get_sitekey()

Categories

Resources