This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/
This is my code. As you can see, I don't know how to fill in the two {} placeholders in the url variable, because the four categories I want to scrape are encoded differently in the URL (the Services one in particular). The comments above the url variable show the query string each of the four categories produces when clicked. I appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
    #cate=Food+%26+Beverage
    #cate=Electronics+%26+Technology
    #cate=Fashion+%26+Accessories
    #cate=Services
    url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1,14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            for link in soup.find_all('div', class_='entry-content'):
                try:
                    shops = soup.find_all('div', class_="col-9")
                    names = soup.find_all('tr', class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ','')
                        desc = k.text.replace(' ','')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request so you don't have to manage escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'
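To fill in the "# your code here" part, here is a minimal sketch that reuses the col-9 and clickable selectors from the question; I have not re-verified those class names against the live page, so treat them as assumptions:

shops = soup.find_all('div', class_='col-9')
names = soup.find_all('tr', class_='clickable')
for n, k in zip(names, shops):
    # td index 1 holds the shop name in the original attempt (assumption)
    name = n.find_all('td')[1].get_text(strip=True)
    desc = k.get_text(strip=True)
    print(name, desc)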
I am learning to scrape with Python, and my idea is to build a tool that obtains data from a web page. My problem is writing the "for" loop that goes through the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
This is not an assignment, it is my own initiative, but I am not making progress, so I am thankful for your help.
Here is the code I have so far:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE
    # We make the request to the web
    req = requests.get(url)
    # We check that the request returns a Status Code = 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the divs where the items are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})
        # We go through all the items and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            #Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            #location = entrada.find('div', {'class': 'list-location-region'}).getText()
            #content = entrada.find('div', {'class': 'tx'}).getText()
            #phone = entrada.find('div', {'class': 'telefonos'}).getText()
            # Offer title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()
            # Location
            #entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            #for entrada4 in entradas_location:
            #    counter += 1
            #    location = entrada4.find('div', {'class': 'list-location-region'}).getText()
            # Offer content
            #entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            #for entrada3 in entradas_content:
            #    counter += 1
            #    content = entrada3.find('div', {'class': 'tx'}).getText()
            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        try:
            r = requests.head(req)
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break
        # If the page no longer exists and it gives me a 400
Correct entradas_IDoffer:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under an "a" tag, not a "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other data.
They are probably loading the phone number with JavaScript, so you may not be able to get it with bs4; you can get it using Selenium.
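If you do need the phone numbers, a minimal Selenium sketch might look like the following; the selector is a placeholder carried over from the question, since I have not inspected how the rendered page exposes the number:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes a working Chrome/driver setup
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')

# placeholder selector: inspect the rendered page and adjust it
for ad in driver.find_elements(By.CSS_SELECTOR, 'div.aditem-detail'):
    print(ad.text)

driver.quit()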
You wrote very lengthy code to loop through multiple pages; just use range like this to go through pages 1 and 2, putting the URL in a formatted string:
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)
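If some ads are missing one of these fields, find() returns None and .text raises an AttributeError. A small defensive variant (my own addition, not part of the answer above) is:

def safe_text(parent, tag, cls):
    # return the stripped text of the first match, or 'N/A' if the element is missing
    el = parent.find(tag, class_=cls)
    return el.text.strip() if el else 'N/A'

for entrada in entradas_IDoffer:
    title = safe_text(entrada, "a", "aditem-detail-title")
    location = safe_text(entrada, "div", "list-location-region")
    print(title, location)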
I am trying to scrape a small chunk of information from a site, fetching the data and storing it in a CSV dataset. The project: a list of contact data for community services and official helpdesks in cities and villages, approximately 1600 records.
The base site: https://www.service-bw.de/web/guest/trefferliste/-/trefferliste/q-rathaus
A detail page: Rathaus [Gemeinde Grünkraut]
https://www.service-bw.de/web/guest/organisationseinheit/-/sbw-oe/Rathaus-6000566-organisationseinheit-0
Note: there are approximately 1600 detail pages, so one of the main questions is how to loop over all the pages that contain the data. The relevant markup on a detail page looks like this:
<div class="sp-m-organisationseinheitDetails-basisInfos-content sp-l-grid-container">
<div class="sp-l-grid-row">
<div class="sp-l-grid-col-md-6 sp-l-grid-col-sm-6 sp-l-grid-xs-col-12">
<div> <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress" class="sp-m-organisationseinheitDetails-basisInfos-addressBlock">
<h4 class="sp-m-organisationseinheitDetails-basisInfos-detailsTitle mdash">Hausanschrift</h4>
<div itemprop="streetAddress"> <span>Scherzachstr.</span> <span>2</span><br>
Desired Output:
Hausanschrift:
- name
- street & housenumber
- postal code & town
Kontaktmöglichkeiten:
- telephone
- fax
- e-mail
- internet
The screenshot (not included here) shows this chunk of information, which appears in each of the more than 1600 records.
My approach:
import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor

url = "https://www.service-bw.de/web/guest/trefferliste/-/trefferliste/q-rathaus{}"

def main(url, num):
    with requests.Session() as req:
        print(f"Collecting Page# {num}")
        r = req.get(url.format(num))
        soup = BeautifulSoup(r.content, 'html.parser')
        link = [item.get("href")
                for item in soup.findAll("a", rel="bookmark")]
        return set(link)

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(main, url, num)
               for num in [""] + [f"page/{x}/" for x in range(2, 50)]]

allin = []
for future in futures:
    allin.extend(future.result())

soup = BeautifulSoup(r.content, 'html.parser')
target = [item.get_text(strip=True, separator=" ") for item in soup.find(
    "h4", class_="sp-m-organisationseinheitDetails-basisInfos-content sp-l-grid-container").find_next("ul").findAll("dd itemprop")[:8]]
head = [soup.find("h4", class_="plugin-title").text]
new = [x for x in target if x.startswith(
    ("Telefon", "Fax", "E-Mail", "Internet"))]
return head + new

with ThreadPoolExecutor(max_workers=50) as executor1:
    futures1 = [executor1.submit(parser, url) for url in allin]
    for future in futures1:
        print(future.result())
By the way, perhaps we can get the additional content too, but at the moment I am trying to get a basic grip on fetching the pages and parsing them generally.
Where I got stuck: I get this error:
File "C:\Users\Kasper\Documents_f_s_j_mk__dev_\bs\bw.py", line 28
    target = [item.get_text(strip=True, separator=" ") for item in soup.find(
    ^
IndentationError: unexpected indent
[Finished in 0.32s]
Besides this, I suspect that the whole code runs well and fetches all the wanted items.
Perhaps you have a hint or some guidance for me. Thanks in advance.
import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor
from colorama import Fore, Style

req = requests.Session()

params = {
    "p_p_id": "suchergebnisse_WAR_suchegui",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "nextHits",
    "p_p_cacheability": "cacheLevelPage",
    "p_p_col_id": "column-1",
    "p_p_col_count": "1"
}

data = {
    "q": ""
}

def soup(content):
    soup = BeautifulSoup(content, 'html.parser')
    target = [item['href']
              for item in soup.select("a[id^=organisationseinheit]")]
    return target

def main(url):
    r = req.get(url)
    urls = soup(r.content)
    print("Extracting Page 1")
    return urls

go = main("https://www.service-bw.de/web/guest/trefferliste/-/trefferliste/q-rathaus")

def second(url):
    links = []
    for page in range(1, 166):
        print(f"Extracting Page {page + 1}")
        r = req.get(url, params=params, data=data)
        urls = soup(r.content)
        links.extend(urls)
    return links

allin = go + second("https://www.service-bw.de/web/guest/trefferliste")
print(f"Catched {len(allin)} Links")

def third(url):
    r = req.get(url)
    parser = BeautifulSoup(r.content, 'html.parser')
    print(url)
    try:
        part1 = [item.text for item in parser.find("div", itemprop="streetAddress").parent.findAll(
            "span", text=True)]
    except AttributeError:
        part1 = ["N/A"]
    try:
        part2 = [item.text for item in parser.select(
            "dd.sp-l-grid-col-md-7")[:4]]
    except AttributeError:
        part2 = ["N/A"]
    finish = part1 + part2
    return finish

with ThreadPoolExecutor(max_workers=30) as executor:
    futures = executor.map(third, allin)
    for future in futures:
        print(f"{Fore.GREEN}{future}{Style.RESET_ALL}")
I have one question.
In my code I have:
r = session.get("https://xxxxxxx.com/online/GIRL")
print (r.status_code)
print (r.cookies)
soups = BeautifulSoup(r.content, 'html5lib')
def getPeopleLinks(page):
    links = []
    for link in soups.find_all('a'):
        url = link.get('href')
        if url:
            if 'profile/' in url:
                links.append(url)
    return links
How can I get the list of all profiles on all available pages (e.g. 1 2 3 4 5 6, etc.) and put them into Links[]?
The webcode is:
<div class="pages"><span>1</span>
2
3
4
<a accesskey="x" href="online/GIRL/2">Next</a></div>
Thanks!
ADDED:
Thanks for the answer. The other pagination pages have the same HTML as the main site, so I only need to read all users from all pagination pages (2, 3, 4, 5, etc.).
For my main site everything works fine; I only need to add all users from all pagination pages to LINKS[].
login_data = {
    'login': 'xxxxx',
    'pass': 'xxxx',
    'back_url': ''
}

def getPeopleLinks(page):
    links = []
    for link in soups.find_all('a'):
        url = link.get('href')
        if url:
            if 'profil/' in url:
                links.append(url)
    return links

with requests.Session() as session:
    url = "https://xxxxx.com/login/?form_login=1"
    post = session.post(url, data=login_data, headers=headers)
    print(post.status_code)
    print(post.cookies)
    r = session.get("https://xxxx.com/online/Girls")
    print(r.status_code)
    print(r.cookies)
    soups = BeautifulSoup(r.content, 'html5lib')
    x = getPeopleLinks(soups)
    print(x)
    for path in x:
        sleep(3)
        url = 'http://www.xxxx.com' + path
        page = urllib.request.urlopen(url)
        print(url)
The output is:
http://www.xxxx.com/profile/nickname
That all works fine, but only for:
https://xxxx.com/online/Girls
I need to read all users from all the pagination pages.
Thanks :)
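Since the Next link in the snippet points to online/GIRL/2, the pagination URLs appear to follow the pattern online/GIRL/<n>. A minimal sketch that loops over the page numbers and feeds each page's soup into the existing function could look like this; the upper bound of 7 is an assumption you would replace with the real last page number:

all_links = []
with requests.Session() as session:
    session.post("https://xxxxx.com/login/?form_login=1", data=login_data, headers=headers)
    for n in range(1, 7):  # assumed page count; adjust to the real last page
        page_url = "https://xxxx.com/online/GIRL" if n == 1 else f"https://xxxx.com/online/GIRL/{n}"
        r = session.get(page_url)
        soups = BeautifulSoup(r.content, 'html5lib')
        all_links.extend(getPeopleLinks(soups))
print(all_links)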
I've written a script in Python to get the tabular data that is populated after filling in two input boxes (From and Through) located at the top right corner of a webpage. The dates I filled in to generate results are 08/28/2017 and 11/25/2018.
When I run the following script, I can get the tabular results from its first page.
However, the data are spread across multiple pages through pagination and the URL remains unchanged. How can I get the next pages' content?
URL of the site: https://www.myfloridalicense.com/FLABTBeerPricePosting/
This is my attempt:
import requests
from bs4 import BeautifulSoup

url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"

res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""

viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')

payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    req = s.post(url, data=payload, cookies=res.cookies.get_dict())
    sauce = BeautifulSoup(req.text, "lxml")
    for items in sauce.select("#MainContent_gvBRCSummary tr"):
        data = [item.get_text(strip=True) for item in items.select("th,td")]
        print(data)
Any help solving this issue will be highly appreciated. Once again: the data I wish to grab is the tabular content from the site's next pages; my script can already parse the data from its first page.
P.S.: A browser simulator is not an option I would like to use.
You need to add a loop for each page and assign the requested page number to the __EVENTARGUMENT parameter as follows:
import requests
from bs4 import BeautifulSoup

url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"

res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""

viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')

payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}

for page in range(1, 12):
    with requests.Session() as s:
        s.headers["User-Agent"] = "Mozilla/5.0"
        payload['__EVENTARGUMENT'] = f'Page${page}'
        req = s.post(url, data=payload, cookies=res.cookies.get_dict())
        sauce = BeautifulSoup(req.text, "lxml")
        for items in sauce.select("#MainContent_gvBRCSummary tr"):
            data = [item.get_text(strip=True) for item in items.select("th,td")]
            print(data)
I can scrape one site easily, but on the other I get an error. I'm not sure if it's because the website has some sort of block on it or something.
import random
from bs4 import BeautifulSoup
import urllib2
import re
from urlparse import urljoin

user_input = raw_input("Search for Team = ")

resp = urllib2.urlopen("http://idimsports.eu/football.html")  ### working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
base_url = "http://idimsports.eu"

links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])

resp = urllib2.urlopen("http://cricfree.tv/football-live-stream")  ### not working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])
Set the User-Agent header of your request:
headers = { 'User-Agent' : 'Mozilla/5.0' }
req = urllib2.Request("http://cricfree.tv/football-live-stream", None, headers)
resp = urllib2.urlopen(req)
Also, in your second loop you're reusing base_url from the first site; you probably don't want to do that.
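Putting both points together, a sketch of the corrected second block (the separate base URL for the second site is my assumption about what was intended) might look like this:

headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request("http://cricfree.tv/football-live-stream", None, headers)
resp = urllib2.urlopen(req)
soup = BeautifulSoup(resp, "html.parser")

base_url = "http://cricfree.tv"  # use the second site's own base URL
links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])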