Python - Pagination loop

I have one question.
In my code I have:
r = session.get("https://xxxxxxx.com/online/GIRL")
print(r.status_code)
print(r.cookies)
soups = BeautifulSoup(r.content, 'html5lib')

def getPeopleLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'profile/' in url:
                links.append(url)
    return links
How can I get the list of all profiles on all available pages (e.g. 1 2 3 4 5 6 etc.) and put them into links[]?
The page's HTML is:
<div class="pages"><span>1</span>
<a href="online/GIRL/2">2</a>
<a href="online/GIRL/3">3</a>
<a href="online/GIRL/4">4</a>
<a accesskey="x" href="online/GIRL/2">Next</a></div>
Thanks!
ADDED:
Thanks for the answer. The other paginated pages have the same HTML as the main page, so I only need to read all the users from every page (2, 3, 4, 5, etc.).
Everything works fine for my main page; I only need to add the users from all the paginated pages to links[].
import urllib.request
import requests
from time import sleep
from bs4 import BeautifulSoup

login_data = {
    'login': 'xxxxx',
    'pass': 'xxxx',
    'back_url': ''
}

def getPeopleLinks(page):
    links = []
    for link in page.find_all('a'):
        url = link.get('href')
        if url:
            if 'profile/' in url:
                links.append(url)
    return links
with requests.Session() as session:
    url = "https://xxxxx.com/login/?form_login=1"
    post = session.post(url, data=login_data, headers=headers)  # headers is defined elsewhere in my script
    print(post.status_code)
    print(post.cookies)
    r = session.get("https://xxxx.com/online/Girls")
    print(r.status_code)
    print(r.cookies)
    soups = BeautifulSoup(r.content, 'html5lib')
    x = getPeopleLinks(soups)
    print(x)
    for path in x:
        sleep(3)
        url = 'http://www.xxxx.com' + path
        page = urllib.request.urlopen(url)
        print(url)
The output is:
http://www.xxxx.com/profile/nickname
All of this works fine, but only for:
https://xxxx.com/online/Girls
I need to read all the users from all the paginated pages.
Thanks :)
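
One way to cover every page is to keep following the "Next" link until it disappears. Below is a minimal sketch of that idea, assuming the placeholder domain, the html5lib parser, and the 'profile/' marker from the question, and that the pager's hrefs are relative as in the snippet above:

import requests
from bs4 import BeautifulSoup

def get_people_links(soup):
    # Collect every profile link on the current page.
    return [a['href'] for a in soup.find_all('a', href=True)
            if 'profile/' in a['href']]

links = []
url = "https://xxxxxxx.com/online/GIRL"  # placeholder from the question
with requests.Session() as session:
    while url:
        r = session.get(url)
        soup = BeautifulSoup(r.content, 'html5lib')
        links.extend(get_people_links(soup))
        # The pager ends with <a accesskey="x" href="online/GIRL/2">Next</a>;
        # stop once no such link is present on the page.
        next_link = soup.select_one('div.pages a[accesskey="x"]')
        url = 'https://xxxxxxx.com/' + next_link['href'] if next_link else None

print(len(links), 'profile links collected')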

Related

Trying to scrape other categories with BeautifulSoup

This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/
This is my code. As you can see, I don't know how to fill both of the {} placeholders in the url variable, since the URLs for the 4 categories I want to scrape differ (the one for Services especially). The comments above the url variable show the link for each of the 4 categories when clicked. I appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate = ["Service", "Food & Beverage", "Fashion & Accessories", "Electronics & Technology"]
    # cate=Food+%26+Beverage
    # cate=Electronics+%26+Technology
    # cate=Fashion+%26+Accessories
    # cate=Services
    url = "https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1, 14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            for link in soup.find_all('div', class_='entry-content'):
                try:
                    shops = soup.find_all('div', class_="col-9")
                    names = soup.find_all('tr', class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ', '')
                        desc = k.text.replace(' ', '')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request and avoid managing escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'
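
For completeness, the table parsing from the question could drop in where the # your code here comment sits. Here is a sketch, reusing the col-9 and clickable selectors from the question's code (not verified against the live page):

import requests
from bs4 import BeautifulSoup

cate = ["Services", "Food & Beverage", "Fashion & Accessories", "Electronics & Technology"]
url = "https://www.jurongpoint.com.sg/store-directory"

for cat in cate:
    for page in range(1, 14):
        resp = requests.get(url, params={'level': '', 'cate': cat, 'page': page})
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Selectors copied from the question; adjust them if the markup changes.
        names = soup.find_all('tr', class_='clickable')
        shops = soup.find_all('div', class_='col-9')
        for n, k in zip(names, shops):
            name = n.find_all('td')[1].text.strip()
            print(name, '-', k.text.strip())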

How to scrape multiple pages of a site with paging, using BeautifulSoup and requests?

I created a scraper using BeautifulSoup and requests that scrapes Ask.com search results for keywords entered by the user. For now the scraper is limited to a single page of search results. Here is the basic code of my scraper:
import requests
from bs4 import BeautifulSoup as bs

def search(request):
    if request.method == 'POST':
        search = request.POST['search']
        url = 'https://www.ask.com/web?q=' + search
        res = requests.get(url)
        soup = bs(res.text, 'lxml')
        result_listings = soup.find_all('div', {'class': 'PartialSearchResults-item'})
        final_result = []
        for result in result_listings:
            result_title = result.find(class_='PartialSearchResults-item-title').text
            result_url = result.find('a').get('href')
            result_desc = result.find(class_='PartialSearchResults-item-abstract').text
            final_result.append((result_title, result_url, result_desc))
        context = {
            'final_result': final_result
        }
To make sure BeautifulSoup can scrape the other 5 pages of search results by following the pagination, I modified my code like this:
def search(request):
    if request.method == 'POST':
        search = request.POST['search']
        url = 'https://www.ask.com/web?q=' + search
        res = requests.get(url)
        soup = bs(res.text, 'lxml')
        result_listings = soup.find_all('div', {'class': 'PartialSearchResults-item'})
        final_result = []
        for result in result_listings:
            while True:
                result_title = result.find(class_='PartialSearchResults-item-title').text
                result_url = result.find('a').get('href')
                result_desc = result.find(class_='PartialSearchResults-item-abstract').text
                result_nextpage = result.find('a').get('PartialWebPagination-next')
                if result_nextpage.find_all('div', {'class': 'PartialSearchResults-item'}):
                    url = 'https://www.ask.com/web?q=' + search + result.find('a').get('PartialWebPagination-next')
                    return url
                else:
                    final_result.append((result_title, result_url, result_desc))
        context = {
            'final_result': final_result
        }
Then, when I run python manage.py runserver and enter keywords in the search bar, instead of returning the scraped results the page keeps loading without stopping. I am asking for help from more experienced members of the community because I do not know where my error lies. Inspired by this question, I also modified the url variable:
url = "https://www.ask.com/search?q=" + search + "&start=" + str((page - 1) * 5)
and when I executed it, I got the error name 'page' is not defined. So I ask the community for help. Thank you.
If your code works for a single page, then with a small change it will work on the next pages too. Just change the page number in the URL, since ask.com supports it.
def search(request):
    if request.method == 'POST':
        search = request.POST['search']
        max_pages_to_scrap = 5
        final_result = []
        for page_num in range(1, max_pages_to_scrap + 1):
            url = "https://www.ask.com/web?q=" + search + "&qo=pagination&page=" + str(page_num)
            res = requests.get(url)
            soup = bs(res.text, 'lxml')
            result_listings = soup.find_all('div', {'class': 'PartialSearchResults-item'})
            for result in result_listings:
                result_title = result.find(class_='PartialSearchResults-item-title').text
                result_url = result.find('a').get('href')
                result_desc = result.find(class_='PartialSearchResults-item-abstract').text
                final_result.append((result_title, result_url, result_desc))
        context = {'final_result': final_result}

How to get the base string and the page-number string in a for loop?

Currently I am putting the full URL in urlList; I want only the string after page_no= in the list, and the rest of the program should go on as it is. For example:
https://bidplus.gem.gov.in/bidlists?bidlists&page_no=AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI
import requests
from bs4 import BeautifulSoup as bs

urlList = ["https://bidplus.gem.gov.in/bidlists?bidlists&page_no=AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI",
           "https://bidplus.gem.gov.in/bidlists?bidlists&page_no=Hgw0LYpSZdLXow1Wq84uKar1nxXbFhClXQDuAAiPDxU",
           "https://bidplus.gem.gov.in/bidlists?bidlists&page_no=rO5Erb90Q_P1S0fL5O6FEShlv20RBXmkHFusZogvUoo",
           "https://bidplus.gem.gov.in/bidlists?bidlists&page_no=jiE0kS8e-ghmlmjDMPUJm1OBCRotqJ6n7srXZN99LZc",
           "https://bidplus.gem.gov.in/bidlists?bidlists&page_no=MY89EG2RtzpSMlT1wjE61Cv31nAyetQ49kmXfw2AfMo",
           ]

for url in urlList:
    print('Hold on creating URL to fetch data...')
    url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
    print('URL created: ' + url)
    scraped_data = requests.get(url, verify=False)
    soup_data = bs(scraped_data.text, 'lxml')
    extracted_data = soup_data.find('div', {'id': 'pagi_content'})
Use this line after your urlList variable:
urlList = [x.split('=')[-1] for x in urlList]
You can split the URLs on = and take the part you need:
for url in urls:
    print(url.split("=")[-1])
outputs:
AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI
Hgw0LYpSZdLXow1Wq84uKar1nxXbFhClXQDuAAiPDxU
rO5Erb90Q_P1S0fL5O6FEShlv20RBXmkHFusZogvUoo
jiE0kS8e-ghmlmjDMPUJm1OBCRotqJ6n7srXZN99LZc
MY89EG2RtzpSMlT1wjE61Cv31nAyetQ49kmXfw2AfMo
If you want the page numbers in their own list, do this:
pagenumbers = [i.split("=")[-1] for i in urls]
>>> pagenumbers
['AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI', 'Hgw0LYpSZdLXow1Wq84uKar1nxXbFhClXQDuAAiPDxU', 'rO5Erb90Q_P1S0fL5O6FEShlv20RBXmkHFusZogvUoo', 'jiE0kS8e-ghmlmjDMPUJm1OBCRotqJ6n7srXZN99LZc', 'MY89EG2RtzpSMlT1wjE61Cv31nAyetQ49kmXfw2AfMo']
There is no need to split the URLs.
In your for loop you can just use the url directly, since you are iterating over the full URL:
for url in urlList:
    print('Hold on fetching data...')
    scraped_data = requests.get(url, verify=False)
    soup_data = bs(scraped_data.text, 'lxml')

Create a specific Web Scraper

I am making the effort to learn scraping in Python, and in this case my idea is to build a tool that obtains data from a web page. My problem is writing the "for" loop that goes through the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
It is not homework, it is my own initiative, but I am not making progress, so I thank you for your help.
Here is the code I have:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE
    # We make the request to the web
    req = requests.get(url)
    # We check that the request returns a Status Code = 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the divs where the inputs are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})
        # We go through all the inputs and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            # Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            # location = entrada.find('div', {'class': 'list-location-region'}).getText()
            # content = entrada.find('div', {'class': 'tx'}).getText()
            # phone = entrada.find('div', {'class': 'telefonos'}).getText()
            # Offer Title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()
            # Location
            # entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada4 in entradas_location:
            #     counter += 1
            #     location = entrada4.find('div', {'class': 'list-location-region'}).getText()
            # Offer content
            # entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada3 in entradas_content:
            #     counter += 1
            #     content = entrada3.find('div', {'class': 'tx'}).getText()
            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        # If the page no longer exists and it gives me a 400
        try:
            r = requests.head(url)
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break
Correct entradas_IDoffer:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under the "a" tag, not "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other fields.
They might be loading the phone number with JavaScript, so you may not be able to get it with bs4; you can get it using Selenium.
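If you go the Selenium route, a minimal sketch might look like the following (the telefonos class is taken from the question's commented-out attempt, not verified against the live page):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # needs a matching ChromeDriver installed
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')
# Selenium runs the page's JavaScript, so content rendered client-side
# is present in page_source, unlike with a plain requests.get().
soup = BeautifulSoup(driver.page_source, 'html.parser')
for tel in soup.find_all('div', class_='telefonos'):
    print(tel.text.strip())
driver.quit()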
You wrote very lengthy code to loop through multiple pages; just use range like this to go through pages 1 and 2, putting the URL in a formatted string:
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)

Web Page Scraping with BeautifulSoup

I am new and I am trying to get the links of each product on this web page, across all its subpages (1-8): https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua
I have a loop that goes over each page, but for some reason page 7 only brings back 20 products and page 8 brings none.
This is the function that gets me all the product URLs on each page:
def get_all_product_url(base_url):
    # Set up the link and get all URLs
    page = requests.get(base_url, stream=True)
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
    url_list = []
    try:
        products = soup.find_all('div', {'class': 'jsx-3418419141 product-thumbnail'})
    except:
        return url_list
    for i in products:
        url = i.find("a").get('href')
        if 'https://www.sodimac.cl' in url:
            url_list.append(url)
        else:
            url_list.append('https://www.sodimac.cl' + url)
    # Return all web addresses without duplicates
    return list(set(url_list))
When I run it for page 8, I get an empty list:
base_url = "https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua?currentpage=8"
page = requests.get(base_url, stream=True)
soup = BeautifulSoup(page.content, 'html.parser',from_encoding='utf-8')
url_list = get_all_product_url(base_url)
url_list
If you run it for page 1, you will get 28 entries:
base_url = "https://www.sodimac.cl/sodimac-cl/category/scat359268/Esmaltes-al-agua?currentpage=1"
page = requests.get(base_url, stream=True)
soup = BeautifulSoup(page.content, 'html.parser',from_encoding='utf-8')
url_list = get_all_product_url(base_url)
url_list
Any help is really appreciated.
Thanks
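
A likely explanation is that the category simply runs out of products, so page 7 is a partial last page and page 8 is past the end. Under that assumption, here is a sketch of a loop that reuses get_all_product_url from above and stops at the first empty page:

all_urls = []
page_number = 1
while True:
    base_url = ('https://www.sodimac.cl/sodimac-cl/category/scat359268/'
                f'Esmaltes-al-agua?currentpage={page_number}')
    # get_all_product_url() is the function defined in the question.
    urls = get_all_product_url(base_url)
    if not urls:
        # An empty page means we have run past the last page of the category.
        break
    all_urls.extend(urls)
    page_number += 1

print(len(all_urls), 'product URLs collected')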
