Unable to crawl multiple URLs - Python

I have two functions which crawl a web page, look for a particular class, and extract the href inside it.
url="https://www.poynter.org/ifcn-covid-19-misinformation/page/220/"
def url_parse(site):
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
return soup
def article_link(URL):
try:
soup=url_parse(URL)
for i in soup.find_all("a", class_="button entry-content__button entry-content__button--smaller"):
link=i['href']
except:
pass
return link
data['article_source']=""
for i, rows in data.iterrows():
rows['article_source']= article_link(rows['url'])
Issue
The functions url_parse and article_link work fine, but when I use article_link to update the cells inside the data frame, it stops working after 1000 or 1500 URLs. I suspect my laptop's IP address may be getting blocked, but I don't know how to confirm or solve that because there is no error message.
Expectation
The function article_link should parse every URL inside the data frame.
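One thing worth noting: the bare except: pass in article_link hides every failure, so an IP block or a timeout looks like the loop silently stopping rather than raising. A minimal sketch that surfaces errors, writes back to the data frame with data.at, and waits between requests might look like this (article_link_verbose and the 0.5 s delay are illustrative assumptions, not part of the original code):

import logging
import time

logging.basicConfig(level=logging.INFO)

def article_link_verbose(url):
    """Like article_link, but reports failures instead of swallowing them."""
    try:
        soup = url_parse(url)
        links = [a['href'] for a in soup.find_all(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links[-1] if links else None
    except Exception as exc:
        logging.warning("request failed for %s: %s", url, exc)
        return None

for i, row in data.iterrows():
    data.at[i, 'article_source'] = article_link_verbose(row['url'])
    time.sleep(0.5)  # assumed politeness delay; tune as needed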

import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor

url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url, num):
    with requests.Session() as req:
        print(f"Extracting Page# {num}")
        r = req.get(url.format(num), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = [item.get("href") for item in soup.findAll(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(main, url, num) for num in range(1, 238)]
    for future in futures:
        print(future.result())
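If the goal is to collect everything into one list (and tolerate the occasional failed page) rather than just print each result, a possible variation on the same approach is sketched below; all_links, the lower max_workers value, and the as_completed loop are additions for illustration, not part of the original answer:

from concurrent.futures import ThreadPoolExecutor, as_completed

all_links = []
with ThreadPoolExecutor(max_workers=20) as executor:  # fewer workers is gentler on the server
    futures = {executor.submit(main, url, num): num for num in range(1, 238)}
    for future in as_completed(futures):
        try:
            all_links.extend(future.result())
        except Exception as exc:
            print(f"Page {futures[future]} failed: {exc}")

print(f"Collected {len(all_links)} links in total")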

Related

Using Python, BeautifulSoup, CSV to scrape a URL

In this URL https://doc8643.com/aircrafts I want to scrape all rows.
Then for each individual row, for example https://doc8643.com/aircraft/A139, I want to scrape these three areas of data:
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this be done in Python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://doc8643.com/aircrafts'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the list items and extract the text from the 'h3' tags
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            writer.writerow([c.text if c.text else '' for c in row.select('h3')])
            print(row)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
        else:
            break
You should create a function which takes the value c.text (i.e. A139), builds the full URL like https://doc8643.com/aircraft/A139, and runs requests and BeautifulSoup to get all the needed data.
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # ... scrape details and put in list `results` ...

    return results
and run it in your loop
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
    data = [c.text if c.text else '' for c in row.select('h3')]
    for item in data:
        values = scrape_details(item)
        writer.writerow([item] + values)
The biggest problem is scraping the details.
For some of them you need to scrape the dl, then all the dt and dd elements, and use zip() to group them in pairs.
Something like:
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)
    return results
but this needs more code, so I skip this part.
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# --- functions ---

def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)
    return results

# --- main ---

url = 'https://doc8643.com/aircrafts'

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["data1", "data2", "data3", "etc..."])

    while True:
        print('url:', url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Go through the list items and extract the text from the 'h3' tags
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            data = [c.text if c.text else '' for c in row.select('h3')]
            for item in data:
                values = scrape_details(item)
                writer.writerow([item] + values)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
            url = 'https://doc8643.com' + url
        else:
            break
BTW: maybe it would be better to keep the results as a dictionary:

results[dt.string] = [dd.string]
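A minimal sketch of that dictionary variant, assuming the same dl/dt/dd structure as above (the name scrape_details_dict and the use of get_text() are illustrative choices, not from the original answer):

def scrape_details_dict(number):
    url = 'https://doc8643.com/aircraft/' + number
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = {}
    for dl in soup.find_all('dl'):
        for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
            # a repeated dt label would overwrite the earlier value
            results[dt.get_text(strip=True)] = dd.get_text(strip=True)
    return results

Writing such a dictionary to the CSV would then need a fixed column order, for example via csv.DictWriter.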

Loop duplicating results

I'm writing code to web-scrape the Transfermarkt website, but I'm having some issues with it.
The code returned an error that was fixed through this topic: Loop thru multiple URLs in Python - InvalidSchema("No connection adapters were found for {!r}".format
After this fix, other problems came up.
First, the code is duplicating the results in the data frame.
Second, the code is taking only the last element of each URL. What I actually want is to get all the agency URLs in pagina = range(1) and then scrape all the players in each agency, through the URLs scraped in the first part.
P.S.: pagina = range(1) will become range(1, 40); it's the number of pages I will scrape to get all the agency links.
Can anyone give me a hand with these issues?
Thanks!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.sessions import default_headers

nome = []
posicao = []
nacionalidade = []
idade = []
clube = []
contrato = []
valor = []

tf = f"http://www.transfermarkt.com.br"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

pagina = range(1, 5)

def main(url):
    with requests.Session() as req:
        links = []
        for lea in pagina:
            print(f"Extraindo links da página {lea}")
            r = req.get(url.format(lea), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            link = [f"{tf}{item.next_element.get('href')}" for item in soup.findAll(
                "td", class_="hauptlink")]
            links.extend(link)
        print(f"Collected {len(links)} Links")
        time.sleep(1)

        for url in links:
            r = requests.get(url, headers=headers)
            r.status_code
            soup = BeautifulSoup(r.text, 'html.parser')
            player_info = soup.find_all('tr', class_=['odd', 'even'])
            for info in player_info:
                player = info.find_all("td")
                vall = info.find('td', {'class': 'zentriert hauptlink'})
                nome.append(player[2].text)
                posicao.append(player[3].text)
                nacionalidade.append(player[4].img['alt'])
                idade.append(player[5].text)
                clube.append(player[6].img['alt'])
                contrato.append(player[7].text)
                valor.append(vall)
            time.sleep(1)

        df = pd.DataFrame(
            {"NOME": nome,
             "POSICAO": posicao,
             "NACIONALIDADE": nacionalidade,
             "IDADE": idade,
             "CLUBE": clube,
             "CONTRATO": contrato,
             "VALOR": valor}
        )
        print(df)
        df
        #df.to_csv('MBB.csv', index=False)

main("https://www.transfermarkt.com.br/berater/beraterfirmenuebersicht/berater?ajax=yw1&page={}")

How to get the value of a "hidden" href?

I'm working with web scraping to, at first, collect the total pages. I have tested the code I wrote on another site, but I am having a problem getting the next page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

pages = []

with requests.Session() as session:
    page_number = 1
    url = "https://www.last.fm/user/"+userName+"/library/artists?page="
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break
        url = urljoin(url, next_link["href"])
        page_number += 1
As you can see, the href of this site presents the link as "?page=2", which does not allow me to get its content (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You try to urljoin(url, next_link["href"]), but next_link does not have an href attribute because you are selecting the <li>, not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

pages = []

with requests.Session() as session:
    url = "https://www.last.fm/user/"+userName+"/library/artists?page=1"
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)
        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break
        url = urljoin(url, next_link.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]

python requests module find urls within the urls already found

This script returns a list of URLs found on the web page.
import requests
from bs4 import BeautifulSoup as BS
from bs4 import Comment

with requests.session() as r:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'}
    r = requests.get('https://ctflearn.com', verify=False, headers=headers)
    response = r.text
    soup = BS(response, 'html.parser')
    tags = soup.find_all('a')
    for tag in tags:
        links = tag.get('href')
        if links[0] == '/':
            appended_link = 'https://ctflearn.com' + links
            print(appended_link)
        elif links[0] == '#':
            pass
        else:
            print(links)
However, what I am interested in is visiting these web pages and finding the links within those pages as well. I know it is possible using a for loop, but I don't know how to implement it.
Thanks for the help.
You could simply use two lists (to_visit and visited) and check whether a URL is already in one of them before adding it to to_visit.
import requests
from bs4 import BeautifulSoup as BS
from bs4 import Comment

with requests.session() as r:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'}

    to_visit = ['https://ctflearn.com']
    visited = []

    while (len(to_visit) > 0):
        url = to_visit.pop(0)
        visited.append(url)
        print('visited: "{0}"'.format(url))

        r = requests.get(url, verify=True, headers=headers)
        response = r.text
        soup = BS(response, 'html.parser')
        tags = soup.find_all('a')
        for tag in tags:
            links = tag.get('href')
            if links is None:
                continue
            elif links[0] == '/':
                appended_link = 'https://ctflearn.com' + links
                if (appended_link not in visited and appended_link not in to_visit):
                    to_visit.append(appended_link)
            elif links[0] == '#':
                pass
            else:
                if (links not in visited and links not in to_visit):
                    to_visit.append(links)
But at some point you will run into a problem, because you will find and try to access something that is not a URL, which is why I would recommend using a validator:
import validators
validators.url(url) # returns True if "url" is a valid url
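For example, the check could sit in a small helper used just before queueing a link (a sketch; validators is a third-party package installed with pip install validators, and queue_if_new is an illustrative name):

import validators

def queue_if_new(link, to_visit, visited):
    # queue only well-formed absolute URLs that have not been seen yet
    if validators.url(link) and link not in visited and link not in to_visit:
        to_visit.append(link)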

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine to search for companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re

companylist = ['ABEX Dachdecker Handwerks-GmbH']

url = 'https://www.firmenwissen.de/index.html'
payloads = {
    'searchform': 'UFT-8',
    'phrase': 'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button": 'submit'
}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list = []

links = soup.findAll('a')

for li in links:
    link_list.append(li.get('href'))

print(link_list)
This code should bring me to the next page with the company information, but unfortunately it returns only the home page. How can I fix this?
Change the initial URL you are searching against. Grab only the appropriate hrefs and add them to a set to ensure there are no duplicates (or alter the selector to return only one match if possible); add those items to a final set for looping, so you only loop over the required number of links. I have used a Session on the assumption that you will repeat this for many companies.
Iterate over the set using Selenium to navigate to each company URL and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}

finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks = finalLinks.union(companyLinks)

for item in finalLinks:
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}

finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
        finalLinks.append(companyLink)

for item in set(finalLinks):
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
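One caveat: the find_element_by_css_selector calls above use the old Selenium 3 API, which was removed in Selenium 4. With a current Selenium install, the equivalent lookups would be written roughly as follows:

from selenium.webdriver.common.by import By

info = d.find_element(By.CSS_SELECTOR, '.yp_abstract_narrow')
address = d.find_element(By.CSS_SELECTOR, '.yp_address')
print(info.text, address.text)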
