I'm unable to fetch the URL for the next page; my code throws a traceback error. Basically I want to grab "/browse-movies?page=2". Here is my code:
from bs4 import BeautifulSoup
import requests
import re
url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('ul', 'tsc_pagination')[0]
for item in items:
    print item
You could use range(1, 300) to iterate over all the pages:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    items = soup.find_all('div', 'browse-movie-wrap')
    for item in items:
        for val in item.find_all('div','browse-movie-bottom'):
            title = item.find_all('a','browse-movie-title')[0].text
            year = item.find_all('div','browse-movie-year')[0].text
        for val in item.find_all('a','browse-movie-link'):
            try:
                rating = val.find_all('h4')[0].text
                genre = val.find_all('h4')[1].text
            except:
                pass
        print year, rating, genre, title
P.S. You might want to add time.sleep(1) to slow down a little, in case they block your IP for scraping their pages too aggressively.
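For example, a minimal sketch of where that delay would go (the one-second pause is arbitrary):
import time

for i in range(1, 300):
    # ... request and parse the page as above ...
    time.sleep(1)   # pause between requests so you don't hammer the server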
Edit:
Now, to find the next-page URL, you could use a regular expression:
import re
next_page = soup.find('a', text=re.compile(r'.*Next.*'))
print next_page['href']
What it does is look for an a tag whose content matches the regular expression '.*Next.*'.
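If you'd rather follow that link than hard-code the page range, here is a minimal sketch (it assumes the href is relative, e.g. "/browse-movies?page=2", so it has to be joined to the site root):
import re
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = "https://yts.ag/browse-movies?page=1"
while url:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # ... scrape the movies on this page, as in the loop above ...
    next_page = soup.find('a', text=re.compile(r'.*Next.*'))
    # stop when there is no "Next" link; otherwise join the relative href to the site root
    url = "https://yts.ag" + next_page['href'] if next_page else None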
urls = ["https://yts.ag/browse-movies?page={}".format(i) for i in range(1, 10)] # make a url list and iterate over it
for url in urls:
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    # your code here
    print year, rating, genre, title
Make a URL list and iterate over it. You can change the range.
I am trying to obtain the second a tag inside a specific td, but I'm not able to get just the text of that second tag; I'm getting data from all of the a tags.
Later I will use a for loop to obtain the data from all 10 td elements. As you can see in the image, I want the data of the second a inside each of the 10 td elements.
My code:
from requests import get
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.oddsportal.com/soccer/spain/laliga'
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
match_containers = html_soup.find_all("td",{ "class" : "name table-participant"})
print(len(match_containers))
first_match = match_containers[0]
first_title = first_match.text
print (first_title)
You need to select the second a tag:
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.oddsportal.com/soccer/spain/laliga'
r = requests.get(url, headers = {'User-Agent' : 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
print([item.text for item in soup.select('#tournamentTable tr[xeid] [href*=soccer]')])
Though you can drop the table id and use:
print([item.text for item in soup.select('tr[xeid] [href*=soccer]')])
To get the rows of the table that contain the useful match data, as a list, I would use:
rows = soup.select('#tournamentTable tr[xeid]')
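And if you specifically want the second a inside each participant cell, as the question asks, here is a hedged sketch built on those rows (it assumes each td.name.table-participant cell really contains at least two anchors, as the question's image suggests):
for row in rows:
    cell = row.select_one('td.name.table-participant')
    if cell:
        anchors = cell.find_all('a')
        if len(anchors) > 1:
            print(anchors[1].text)   # text of the second <a> in the cell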
In my code, a user inputs a search term and the get_all_links function parses the HTML response and extracts the links that start with 'http'. When req is replaced with a hard-coded URL such as:
content = urllib.request.urlopen("http://www.ox.ac.uk")
The program returns a list of properly formatted links. However, when passing in req, no links are returned. I suspect this may be a formatting blip.
Here is my code:
import urllib.request
import urllib.parse          # needed for urllib.parse.urlencode below

def get_all_links(s):                     # function to get all the links
    d = 0
    links = []                            # collect all links into a list
    while d != -1:                        # until d is -1, i.e. no more links in the page
        d = s.find('<a href=', d)         # look for the next <a href=
        start = s.find('"', d)            # start will be the next character
        end = s.find('"', start + 1)      # end will be up to the closing "
        if d != -1:                       # d is not -1
            d += 1
            if s[start + 1] == 'h':       # add the link only if it starts with http
                links.append(s[start + 1:end])   # to the link list
    return links                          # return the list

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:                       # print the returned list
        print(i)

main()
You should use an HTML parser, as suggested in the comments; a library like BeautifulSoup is perfect for this. I have adapted your code to use BeautifulSoup:
import urllib.request
import urllib.parse          # needed for urllib.parse.urlencode below
from bs4 import BeautifulSoup

def get_all_links(s):
    soup = BeautifulSoup(s, "html.parser")
    return soup.select("a[href^=\"http\"]")   # select all anchor tags whose href attribute starts with 'http'

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:   # print the returned list
        print(i)

main()
It uses the select method of the BeautifulSoup library and returns a list of the selected elements (in your case, anchor tags).
Using a library like BeautifulSoup not only makes this easier, but also lets you use much more complex selections. Imagine how you would have to change your code if you wanted to select all links whose href attribute contains the word "google" or "code".
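For instance, a small illustrative sketch of such a selection (the HTML here is made up for the example):
from bs4 import BeautifulSoup

html = '<a href="http://code.google.com">one</a><a href="http://example.com">two</a>'
soup = BeautifulSoup(html, "html.parser")
# [href*="google"] matches any anchor whose href contains the substring "google"
print(soup.select('a[href*="google"]'))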
You can read more in the BeautifulSoup documentation.
I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both of these pages needs to end up in distinctly named lists.
To explain with example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

headers = {'User-Agent': 'Mozilla/5.0'}   # headers was not defined in the snippet as posted

ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]

for url in urls:
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        for cat_list in categories:
            cat_list.append(player_name)

print(ws48_list)
print(ws_list)
This ends up printing two identical lists, when I was shooting for two lists each unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists, just create new ones. (Your inner loop over categories appends every player name to both lists, which is why they come out identical.) Make a function to do the scraping and pass each url to it in turn.
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

def parse_page(url, headers={}):
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    return [a.text for a in section.find_all('a')]

ws_list, ws48_list = [parse_page(url) for url in urls]

print('ws_list = %r' % ws_list)
print('ws48_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
for i, url in enumerate(urls):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        categories[i].append(player_name)

print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs
def scrape(url):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    names = []
    for a in section.find_all('a'):
        player_name = a.text
        names.append(player_name)
    return names
ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')
print(ws_list)
print(ws48_list)
So I have this code, which gives me the URLs I need in a list format:
import requests
from bs4 import BeautifulSoup
offset = 0
links = []
with requests.Session() as session:
    while True:
        r = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        soup = BeautifulSoup(r.content, "html.parser")
        new_links = soup.find_all("a", {'class' : "thumb"})

        # no more links - break the loop
        if not new_links:
            break

        links.extend(new_links)
        print(len(links))
        offset += 24

        # denotes the number of gallery pages to go through (# of pages times 24 equals the number below)
        if offset == 48:
            break

for link in links:
    print(link.get("href"))
After that, I try to get some text from each of the URLs; that text is in roughly the same place on each page. But whenever I run the second half, below, I keep getting a chunk of HTML text and some errors, and I'm not sure how to fix it, or whether there is another, preferably simpler, way to get the text from each URL.
import urllib.request
import re
for link in links:
    url = print("%s" % link)
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    req = urllib.request.Request(url, headers = headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()

    paragraphs = re.findall(r'</a><br /><br />(.*?)</div>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span></div>', str(respData))
    for eachP in paragraphs:
        print(eachP)
    title = re.findall(r'<title>(.*?)</title>', str(respData))
    for eachT in title:
        print(eachT)
Your code:
for link in links:
    url = print("%s" % link)
assigns None to url. Perhaps you mean:
for link in links:
url = "%s" % link.get("href")
There's also no reason to use urllib to get the site's content; you can use requests, as you did before, by changing:
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
to
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
Now you can get the title and paragraph with just:
title = soup.find('div', {'class': 'dev-title-container'}).h1.text
paragraph = soup.find('div', {'class': 'text block'}).text
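Putting those pieces together, a minimal sketch of the second half could look like this (it reuses the links list from the first half; the class names are the ones shown above, and DeviantArt's markup may have changed since):
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
for link in links:
    url = link.get("href")
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.content, "html.parser")
    # these find() calls will raise AttributeError if a page is missing either div
    title = soup.find('div', {'class': 'dev-title-container'}).h1.text
    paragraph = soup.find('div', {'class': 'text block'}).text
    print(title)
    print(paragraph)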
I can scrape one site easily, but for the other I get an error, and I'm not sure whether it's because the website has some sort of block on it.
import random
from bs4 import BeautifulSoup
import urllib2
import re
from urlparse import urljoin
user_input = raw_input("Search for Team = ")

resp = urllib2.urlopen("http://idimsports.eu/football.html")   ### working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
base_url = "http://idimsports.eu"

links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])

resp = urllib2.urlopen("http://cricfree.tv/football-live-stream")   ### not working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))

links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])
Set the User-Agent header of your request:
headers = { 'User-Agent' : 'Mozilla/5.0' }
req = urllib2.Request("http://cricfree.tv/football-live-stream", None, headers)
resp = urllib2.urlopen(req)
Also, in your second loop you're reusing base_url; you probably don't want to do that.
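Continuing from that request, a minimal sketch of the corrected second block (it reuses the names from the question's code above; note that base_url now points at the second site):
base_url = "http://cricfree.tv"   # use the second site's own base URL this time
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))

links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print("No Streams Available")
else:
    for link in links:
        print(urljoin(base_url, link['href']))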