I've written a script in Python to get the tabular data populated upon filling in two input boxes (From and Through) located at the top right corner of a webpage. The dates I filled in to generate the results are 08/28/2017 and 11/25/2018.
When I run the following script, I can get the tabular results from its first page.
However, the data are spread across multiple pages through pagination, and the url remains unchanged. How can I get the next page's content?
URL to the site
This is my attempt:
import requests
from bs4 import BeautifulSoup
url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"
res = requests.get(url)
soup = BeautifulSoup(res.text,"lxml")
try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""
viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')
payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}
with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    req = s.post(url, data=payload, cookies=res.cookies.get_dict())
    sauce = BeautifulSoup(req.text, "lxml")
    for items in sauce.select("#MainContent_gvBRCSummary tr"):
        data = [item.get_text(strip=True) for item in items.select("th,td")]
        print(data)
Any help to solve the issue will be highly appreciated. Once again: the data I wish to grab are the tabular content from the site's subsequent pages; my script can already parse the data from its first page.
P.S.: A browser simulator is not an option I would like to use.
You need to add a loop for each page and assign the requested page number to the __EVENTARGUMENT parameter as follows:
import requests
from bs4 import BeautifulSoup
url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"
res = requests.get(url)
soup = BeautifulSoup(res.text,"lxml")
try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""
viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')
payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}
for page in range(1, 12):
    with requests.Session() as s:
        s.headers["User-Agent"] = "Mozilla/5.0"
        payload['__EVENTARGUMENT'] = f'Page${page}'
        req = s.post(url, data=payload, cookies=res.cookies.get_dict())
        sauce = BeautifulSoup(req.text, "lxml")
        for items in sauce.select("#MainContent_gvBRCSummary tr"):
            data = [item.get_text(strip=True) for item in items.select("th,td")]
            print(data)
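If the server rejects the reused tokens, note that ASP.NET WebForms re-issues __VIEWSTATE and __EVENTVALIDATION with every response. Here is a hedged variant that re-reads them between pages, continuing from the setup above (so url and payload are already defined); the __EVENTTARGET value is an assumption inferred from the grid's client ID MainContent_gvBRCSummary:

def dump_table(html):
    # print every row of the results grid and return the parsed page
    sauce = BeautifulSoup(html, "lxml")
    for row in sauce.select("#MainContent_gvBRCSummary tr"):
        print([cell.get_text(strip=True) for cell in row.select("th,td")])
    return sauce

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    res = s.post(url, data=payload)          # the search POST returns page 1
    for page in range(2, 12):                # then walk the remaining pages
        sauce = dump_table(res.text)
        # refresh the tokens the server issued with this response
        payload['__VIEWSTATE'] = sauce.select_one("#__VIEWSTATE").get('value')
        payload['__EVENTVALIDATION'] = sauce.select_one("#__EVENTVALIDATION").get('value')
        payload['__EVENTTARGET'] = 'ctl00$MainContent$gvBRCSummary'  # assumption: the grid's server-side name
        payload['__EVENTARGUMENT'] = f'Page${page}'
        payload.pop('ctl00$MainContent$btnSearch', None)  # a paging postback omits the Search button
        res = s.post(url, data=payload)
    dump_table(res.text)                     # the last page fetched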
This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/
This is my code. As you can see, I don't know how to fill in both of the {} placeholders in the url variable, since the four categories I want to scrape (especially the URL for Services) are formatted very differently. The comments above the url variable show the query string for each of the four categories when clicked. I'd appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
    #cate=Food+%26+Beverage
    #cate=Electronics+%26+Technology
    #cate=Fashion+%26+Accessories
    #cate=Services
    url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1,14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            for link in soup.find_all('div',class_='entry-content'):
                try:
                    shops=soup.find_all('div',class_="col-9")
                    names=soup.find_all('tr',class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ','')
                        desc = k.text.replace(' ','')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request and avoid having to manage escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
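        # For example (an assumption: this simply reuses the selectors
        # from the question's own code for the name/description pairs):
        for n, k in zip(soup.find_all('tr', class_='clickable'),
                        soup.find_all('div', class_='col-9')):
            print(n.find_all('td')[1].text.strip())
            print(k.text.strip())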
requests then builds the escaped query string for you:
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'
My code is for practice only!
I'm trying to scrape the names and teams of each player in FPL from the website https://www.premierleague.com/, and I've run into some problems with the code.
The problem is that it only gets the page with '-1' at the end of the url, which I haven't even included in my pages list!
There isn't any logic to the page numbers: the base url is https://www.premierleague.com/players?se=363&cl= and the number after the '=' seems to be random, so I created a list of the numbers and appended each to the url with a for loop.
My code:
import requests
from bs4 import BeautifulSoup
import pandas

plplayers = []
pl_url = 'https://www.premierleague.com/players?se=363&cl='
pages_list = ['1', '2', '131', '34']

for page in pages_list:
    r = requests.get(pl_url + page)
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    player_names = soup.find_all('a', {'class': 'playerName'})
    for x in player_names:
        player_d = {}
        player_teams = []
        player_href = x.get('href')
        player_info_url = 'https://www.premierleague.com/' + player_href
        player_r = requests.get(player_info_url, headers=headers)
        player_c = player_r.content
        player_soup = BeautifulSoup(player_c, 'html.parser')
        team_tag = player_soup.find_all('td', {'class': 'team'})
        for team in team_tag:
            try:
                team_name = team.find('span', {'class': 'long'}).text
                if '(Loan)' in team_name:
                    team_name.replace(' (Loan) ', '')
                if team_name not in player_teams:
                    player_teams.append(team_name)
                player_d['NAME'] = x.text
                player_d['TEAMS'] = player_teams
            except:
                pass
        plplayers.append(player_d)

df = pandas.DataFrame(plplayers)
df.to_csv('plplayers.txt')
I would comment this, but I'm new and don't have enough reputation, so I'll have to keep it in an answer.
It looks like when you made the request stored in player_r you specified a headers parameter but never actually defined a headers variable.
If you replace player_r = requests.get(player_info_url, headers=headers) with player_r = requests.get(player_info_url), your code should run perfectly. At least, it did on my machine.
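Alternatively, if the intent was to send a User-Agent header, just define the variable before the loop. A minimal sketch, continuing the question's loop where player_info_url is already defined (the header value here is only an example; any browser-like string works):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # example value; define this before the loop
player_r = requests.get(player_info_url, headers=headers)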
In my code, a user inputs a search term and the get_all_links function parses the html response and extracts the links that start with 'http'. When req is replaced with a hard-coded url such as:
content = urllib.request.urlopen("http://www.ox.ac.uk")
the program returns a list of properly formatted links correctly. However, when passing in req, no links are returned. I suspect this may be a formatting blip.
Here is my code:
import urllib.request
import urllib.parse

def get_all_links(s):                      # function to get all the links
    d=0
    links=[]                               # collect all links into a list
    while d!=-1:                           # until d is -1, i.e. no more links in the page
        d=s.find('<a href=',d)             # if <a href is found
        start=s.find('"',d)                # start will be the next character
        end=s.find('"',start+1)            # end will be up to the closing "
        if d!=-1:                          # d is not -1
            d+=1
            if(s[start+1]=='h'):           # add only the links which start with http
                links.append(s[start+1:end])  # to the link list
    return links                           # return the list

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q' : term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent' : user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
You should use an HTML parser, as suggested in the comments. A library like BeautifulSoup is perfect for this.
I have adapted your code to use BeautifulSoup:
import urllib.request
from bs4 import BeautifulSoup

def get_all_links(s):
    soup = BeautifulSoup(s, "html.parser")
    return soup.select("a[href^=\"http\"]")  # select all anchor tags whose href attribute starts with 'http'

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q' : term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent' : user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
It uses the select method of the BeautifulSoup library and returns a list of the selected elements (in your case anchor tags).
Using a library like BeautifulSoup not only makes it easier, it also lets you use much more complex selections. Imagine how you would have to change your code if you wanted to select all links whose href attribute contains the word "google" or "code". With CSS selectors that is a one-line change, as sketched below.
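For instance, a self-contained sketch using the substring attribute selector (the sample html here is made up for illustration):

from bs4 import BeautifulSoup

html = '<a href="http://code.google.com">g</a><a href="http://example.com">e</a>'
soup = BeautifulSoup(html, "html.parser")
# '*=' matches hrefs merely containing the substring, while '^=' (used above) matches prefixes
print(soup.select('a[href*="google"]'))  # [<a href="http://code.google.com">g</a>]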
You can read the BeautifulSoup documentation here.
I've written a script in Python to get data from a webpage. The site displays its content across 60 pages. My scraper can parse data from its second page. When I try to change the page number in the payload parameter or create a loop to get data from several of the pages, it instantly breaks. How can I rectify my script in such a way that it can fetch data from all of the pages, not only the second one? Thanks in advance.
Link to reach the site with data: Page_link
Link to replace with the below script: page_url
I suppose the pagination number lies in here:
ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages:1
Here is the full script (working only for page 2):
import requests
from bs4 import BeautifulSoup
url = "Link to replace with the above url" ##Replace the number 2 links here
formdata = {
    'searchEntity': 'FundServiceProvider',
    'searchType': 'Name',
    'searchText': '',
    'registers': '6,29,44,45',
    'AspxAutoDetectCookieSupport': '1'
}
req = requests.get(url,params=formdata,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(req.text,"lxml")
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
payload = {
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__LASTFOCUS': '',
    '__VIEWSTATE': VIEWSTATE,
    '__SCROLLPOSITIONX': '0',
    '__SCROLLPOSITIONY': '541',
    '__EVENTVALIDATION': EVENTVALIDATION,
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages': 1,
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$btnNext.x': '260',
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$btnNext.y': '11'
}
with requests.session() as session:
    session.headers = {"User-Agent": "Mozilla/5.0"}
    response = session.post(req.url, data=payload)
    soup = BeautifulSoup(response.text, "lxml")
    tabd = soup.select(".searchresults")[0]
    for items in tabd.select("tr")[:-1]:
        data = ' '.join([item.text for item in items.select("th,td")])
        print(data)
You just need to remove the last two fields of the payload data:
payload = {
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__LASTFOCUS': '',
    '__VIEWSTATE': VIEWSTATE,
    '__SCROLLPOSITIONX': '0',
    '__SCROLLPOSITIONY': '541',
    '__EVENTVALIDATION': EVENTVALIDATION,
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages': 1
}
instead of
payload = {
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__LASTFOCUS': '',
    '__VIEWSTATE': VIEWSTATE,
    '__SCROLLPOSITIONX': '0',
    '__SCROLLPOSITIONY': '541',
    '__EVENTVALIDATION': EVENTVALIDATION,
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages': 1,
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$btnNext.x': '260',
    'ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$btnNext.y': '11'
}
Then updating the ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages value will get the correct page's data, as in the sketch below.
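For example, a hedged sketch of the full paging loop, continuing from the question's script (so payload, req, VIEWSTATE and EVENTVALIDATION are already set up); re-reading the tokens from each response is an assumption based on usual ASP.NET behavior:

with requests.session() as session:
    session.headers = {"User-Agent": "Mozilla/5.0"}
    for page in range(1, 6):  # whatever range of pages you need
        payload['ctl00$cphRegistersMasterPage$gvwSearchResults$ctl18$ddlPages'] = page
        response = session.post(req.url, data=payload)
        soup = BeautifulSoup(response.text, "lxml")
        for items in soup.select(".searchresults")[0].select("tr")[:-1]:
            print(' '.join([item.text for item in items.select("th,td")]))
        # assumption: the server re-issues these tokens with every response
        payload['__VIEWSTATE'] = soup.select("#__VIEWSTATE")[0]['value']
        payload['__EVENTVALIDATION'] = soup.select("#__EVENTVALIDATION")[0]['value']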
Trying to get multiple headlines, links and dates. Only getting the first one. Not sure why BS4 won't fetch all the items... Is it a JavaScript problem?
from bs4 import BeautifulSoup
from urllib import urlopen

html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'), "lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select("div.region.region-content")
for d in div_sub:
    date = d.time.get_text()
    headline = d.h2.a.get_text()
    url = d.a["href"]
    print headline, url, date
What about using the following to capture all the articles, with their links, authors, and posting dates, from the main page? You could store this in a dictionary, or in a pandas DataFrame for easy manipulation.
from bs4 import BeautifulSoup
import requests

baseurl = 'http://www.fiercepharma.com'
response = requests.get(baseurl)
soup = BeautifulSoup(response.content)

cdict = {}
for group in soup.find_all('div', {'class' : 'card horizontal views-row'}):
    try:
        title = group.find('h2', {'class' : 'field-content list-title'}).text
        link = baseurl + group.find('h2', {'class' : 'field-content list-title'}).find('a', href=True)['href']
        author = group.find('span', {'class' : 'field-content'}).find('a').text
        time = group.find('span', {'class' : 'field-content'}).find('time').text
        content = group.find('p', {'class' : 'field-content card-text'}).text
        cdict[link] = {'title' : title, 'author' : author, 'time' : time, 'content' : content}
    except AttributeError as e:
        print('[-] Unable to parse {}'.format(e))

print(cdict)
#{'http://www.fiercepharma.com/manufacturing/lonza-bulks-up-5-5b-deal-for-capsugel': {'author': u'Eric Palmer',
# 'content': u'Swiss CDMO Lonza has pulled the trigger on a $5.5 billion deal to acquire the U.S.-based contract capsule and drug producer Capsugel to create another sizable\u2026',
# 'time': u'Dec 15, 2016 8:45am',
# 'title': u'Lonza bulks up with $5.5B deal for Capsugel'},
Both div.card.horizontal.views-row and .card.horizontal.views-row should work, @citra_amarillo. I ran this and it worked both ways:
from bs4 import BeautifulSoup
from urllib import urlopen

html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'), "lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select(".card.horizontal.views-row")
#div_sub = main_div.select("div.card.horizontal.views-row")
for d in div_sub:
    date = d.time.get_text()
    headline = d.h2.a.get_text()
    url = d.a["href"]
    print headline, url, date