Getting a specific value - python

hey i want to get the last href value in the navigation bar which is this part "/cat/cremes-solaires-indice-30" but i got error that the string index out of range thank you for helping me :)
from bs4 import BeautifulSoup as soup
PAGE_URL = https://www.e.leclerc/fp/sunissime-bb-fluide-protecteur-anti-age-global-spf30-40ml-3508240006457
def product():
response = requests.get(PAGE_URL, headers=HEADERS)
soupe = soup(response.content, 'html5lib')
sauce = soupe.find_all("div", class_="product-breadcrumb")
for x in sauce:
a = x.select('[href]')
print(type(a))
for i in a:
u = i.get("href")
print(u[-2])```

You can find main p tag and from it use find_all method on a tag where it returns list of tag from it extract href
from bs4 import BeautifulSoup
import requests
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
req = requests.get('https://www.e.leclerc/fp/sunissime-bb-fluide-protecteur-anti-age-global-spf30-40ml-3508240006457',headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')
soup.find("p",class_="wrapper d-inline-block").find_all("a")[-1]['href']
Output:
'/cat/cremes-solaires-indice-30'

The object 'u' is not a dictionary or any other data set storing object , So every time the loop repeats the variable u is being over written . So you can either declare a list or can make the variable global .
Method 1:
from bs4 import BeautifulSoup as soup
PAGE_URL = https://www.e.leclerc/fp/sunissime-bb-fluide-protecteur-anti-age-global-spf30-40ml-3508240006457
def product():
response = requests.get(PAGE_URL, headers=HEADERS)
soupe = soup(response.content, 'html5lib')
sauce = soupe.find_all("div", class_="product-breadcrumb")
for x in sauce:
a = x.select('[href]')
print(type(a))
for i in a:
global u
u = i.get("href")
print(u)
Method 2:
from bs4 import BeautifulSoup as soup
PAGE_URL = https://www.e.leclerc/fp/sunissime-bb-fluide-protecteur-anti-age-global-spf30-40ml-3508240006457
def product():
response = requests.get(PAGE_URL, headers=HEADERS)
soupe = soup(response.content, 'html5lib')
sauce = soupe.find_all("div", class_="product-breadcrumb")
for x in sauce:
a = x.select('[href]')
u = []
print(type(a))
for i in a:
u.append(i.get("href"))
print(u[-1])
P.S : Your code is missing many thing else

Related

Using Python, BeautifulSoup, CSV to scrape a URL

In this URL https://doc8643.com/aircrafts I want to scrape all rows.
Then for each individual row, for example https://doc8643.com/aircraft/A139
I want to scrape these three areas of data
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this is done in python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://doc8643.com/aircrafts'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
with open('doc8643.csv', "w", encoding="utf-8") as f:
writer = csv.writer(f)
while True:
print(url)
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
# Go throught table = tbody and extract the data under the 'td' tag
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
writer.writerow([c.text if c.text else '' for c in row.select('h3')])
print(row)
# If more than one page then iterate through all of them
if soup.select_one('ul.pagination li.active + li a'):
url = soup.select_one('ul.pagination li.active + li a')['href']
else:
break
You should create function which get value c.text (ie, A139) and creates full url like https://doc8643.com/aircraft/A139 and runs Request or requests and BeautifulSoup to get all needs data
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# ... scrape details and put in list `results` ...
return results
and run it in your loop
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
data = [c.text if c.text else '' for c in row.select('h3')]
for item in data:
values = scrape_details(item)
writer.writerow([item] + values)
The biggest problem is to scrape details.
For some details it needs to scrape dl and next all dt and dd and use zip() to group in pairs.
Something like
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
results = []
all_dl = soup.find_all('dl')
for item in all_dl:
all_dt = item.find_all('dt')
all_dd = item.find_all('dd')
for dt, dd in zip(all_dt, all_dd):
pair = f"{dt.string}: {dd.string}"
results.append(pair)
print(pair)
#print(results)
return results
but this need more code - and I skip this part.
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# --- functions ---
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
results = []
all_dl = soup.find_all('dl')
for item in all_dl:
all_dt = item.find_all('dt')
all_dd = item.find_all('dd')
for dt, dd in zip(all_dt, all_dd):
pair = f"{dt.string}: {dd.string}"
results.append(pair)
print(pair)
#print(results)
return results
# --- main ---
url = 'https://doc8643.com/aircrafts'
with open('doc8643.csv', "w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["data1", "data2", "data3", "etc..."])
while True:
print('url:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Go throught table = tbody and extract the data under the 'td' tag
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
data = [c.text if c.text else '' for c in row.select('h3')]
for item in data:
values = scrape_details(item)
writer.writerow([item] + values)
# If more than one page then iterate through all of them
if soup.select_one('ul.pagination li.active + li a'):
url = soup.select_one('ul.pagination li.active + li a')['href']
url = 'https://doc8643.com' + url
else:
break
BTW:
Maybe it would be better to keep results as dictionary
results[dt.string] = [dd.string]

How to get the value of a "hidden" href?

I'm working with web scraping to, at first, collect the total pages. I have tested the code I made for another site and however I am having a problem getting the next page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []
with requests.Session() as session:
page_number = 1
url = "https://www.last.fm/user/"+userName+"/library/artists?page="
while True:
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
pages.append(url)
next_link = soup.find("li", class_="pagination-next")
if next_link is None:
break
url = urljoin(url, next_link["href"])
page_number += 1
As you can see, the href of this site presents the link as "?page=2", which does not allow me to get its content (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You try to urljoin(url, next_link["href"]) but next_link do not have an attribute href cause you are selecting the <li> not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []
with requests.Session() as session:
url = "https://www.last.fm/user/"+userName+"/library/artists?page=1"
while True:
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
pages.append(url)
next_link = soup.find("li", class_="pagination-next")
if next_link is None:
break
url = urljoin(url, next_link.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]

'str' object has no attribute 'text'| BeautifulSoup

When running the following code I get an error:
'str' object has no attribute 'text'
import requests
import pandas as pd
from bs4 import BeautifulSoup
baseURL= 'https://www.olx.pl/nieruchomosci/mieszkania/sprzedaz/pomorskie/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'}
offer_links = []
for x in range (1,2):
r = requests.get(f'https://www.olx.pl/nieruchomosci/mieszkania/sprzedaz/pomorskie/?page={x}')
soup = BeautifulSoup(r.content, 'lxml')
offer_list= soup.find_all('div', class_='space rel')
for item in offer_list:
for link in item.find_all('a', href=True):
offer_links.append(link['href'])
print(offer_links)
for link in offer_links:
r= requests.get(link, headers=headers)
soup=BeautifulSoup(r.content, 'lxml')
nazwa = soup.find('h1',class_='css-1oarkq2-Text eu5v0x0').text.strip()
szczegoly = soup.find('p',class_='css-xl6fe0-Text eu5v0x0').text.strip()
cenazam2=[]
poziom=[]
umeblowane=[]
rynek=[]
zabudowa=[]
powi=[]
for i in range(0,7):
p=szczegoly[i].text.strip()
if("Cena" in p):
cenazam2.append(p)
elif("Poziom" in p or "SSD" in p):
poziom.append(p)
elif("Umeblowane" in p):
umeblowane.append(p)
elif("Rynek" in p):
rynek.append(p)
elif("zabudowy" in p):
zabudowa.append(p)
elif("Powierzchnia" in p):
powi.append(p)
oferty = {'Nazwa':nazwa,'Cena':cenazam2,'Poziom':poziom,'Umeblowane':umeblowane,'Rynek':rynek,'Zabudowa':zabudowa,'Powierzchnia':powi}
dataset = pd.DataFrame.from_dict(oferty, orient='index')
dataset = dataset.transpose()
dataset
I need something like an attachment. It's for one test link. It works for one link but I would like to automate it to paste data for the selected number of links.
szczegoly is a list of string. You assign it here:
szczegoly = soup.find('p',class_='css-xl6fe0-Text eu5v0x0').text.strip()
So you can no longer access the PageElements attributes (.text) here:
p=szczegoly[i].text.strip()
You have to decide where you want to strip the values:
szczegoly = soup.find('p',class_='css-xl6fe0-Text eu5v0x0')
...
p=szczegoly[i].text.strip()
or
szczegoly = soup.find('p',class_='css-xl6fe0-Text eu5v0x0').text.strip()
...
p=szczegoly[i]
These should work, when the element/index exists.

How to extract links from elements?

I am trying to extract the links of every individual member but I am not getting output:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
soup = BeautifulSoup(r.text,'lxml')
for link in soup.find_all('h2',class_='resultTitle'):
link1 = link.find('a')
print link1['href']
You need request url with header param. more details
Where resultContent top doctors in Delhi-NCR result div class, cardWrap every doctor cards div class.
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Custom user agent'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
resultContentArray = soup.find('div',{'class':'resultContent'}).find_all("div",{'class':'cardWrap'})
for rr in resultContentArray:
title = rr.find('h2',{'class':'resultTitle'})
link = rr.find("a",href=True)
if link is not None:
print(link['href'])
O/P:
https://www.asklaila.com/category/Delhi-NCR/-/doctors/doctor/?category=176
https://www.asklaila.com/search/Delhi-NCR/greater-kailash-1/doctors/
https://www.asklaila.com/search/Delhi-NCR/-/maternity-hospital/
https://www.asklaila.com/Delhi-NCR/
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/
https://www.asklaila.com/search/Delhi-NCR/-/doctors/20
Use:
html.parser
custom header User-agent
soup.select feature
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('h2[class="resultTitle"] > a'):
print(link['href'])
The output:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/sector-40/dr-amit-yadav/1ik21lZw/
Using **SoupStrainer
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
print(link['href'])
There are twenty correct links to retrieve for members. A concise way is to use css selector of parent class with child combinator to get a tag within
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers= {'User-Agent' : 'Mozilla/5.0'})
soup = BeautifulSoup(r.content,'lxml')
links = [item['href'] for item in soup.select('.resultTitle > a')]
print(links)
The server is looking for User-Agent in header to prevent users from scraping the content
you could set request headers as a work around.
from bs4 import BeautifulSoup
import requests
headers = dict()
headers['User-Agent']= "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
# with open('h.html','w') as w:
# w.write(soup.text)
for link in soup.find_all('h2',class_='resultTitle'):
link1 = link.find('a')
print link1['href']
Should give you
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine and search companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re
companylist = ['ABEX Dachdecker Handwerks-GmbH']
url = 'https://www.firmenwissen.de/index.html'
payloads = {
'searchform': 'UFT-8',
'phrase':'ABEX Dachdecker Handwerks-GmbH',
"mainSearchField__button":'submit'
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list= []
links = soup.findAll('a')
for li in links:
link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change your initial url you are doing search for. Grab the appropriate hrefs only and add to a set to ensure no duplicates (or alter selector to return only one match if possible); add those items to a final set for looping to ensure only looping required number of links. I have used Session on assumption you will repeat for many companies.
Iterate over the set using selenium to navigate to each company url and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()
## searches section; gather into set
with requests.Session() as s:
for company in companyList:
payloads = {
'searchform': 'UFT-8',
'phrase':company,
"mainSearchField__button":'submit'
}
html = s.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'lxml')
companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
# print(soup.select_one('.fp-result').text)
finalLinks = finalLinks.union(companyLinks)
for item in finalLinks:
d.get(item)
info = d.find_element_by_css_selector('.yp_abstract_narrow')
address = d.find_element_by_css_selector('.yp_address')
print(info.text, address.text)
d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []
## searches section; add to list
with requests.Session() as s:
for company in companyList:
payloads = {
'searchform': 'UFT-8',
'phrase':company,
"mainSearchField__button":'submit'
}
html = s.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'lxml')
companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
finalLinks.append(companyLink)
for item in set(finalLinks):
d.get(item)
info = d.find_element_by_css_selector('.yp_abstract_narrow')
address = d.find_element_by_css_selector('.yp_address')
print(info.text, address.text)
d.quit()

Categories

Resources