Web site scraper won't scrape one of my links - Python

I can scrape one site easily, but on the other I get an error. I'm not sure if it's because the website has some sort of block on it.
import random
from bs4 import BeautifulSoup
import urllib2
import re
from urlparse import urljoin

user_input = raw_input("Search for Team = ")

resp = urllib2.urlopen("http://idimsports.eu/football.html")  # working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
base_url = "http://idimsports.eu"
links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])

resp = urllib2.urlopen("http://cricfree.tv/football-live-stream")  # not working
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))
links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])

Set the User-Agent header of your request:

headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request("http://cricfree.tv/football-live-stream", None, headers)
resp = urllib2.urlopen(req)

Also, in your second loop you're reusing base_url from the first site; you probably don't want to do that.
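Putting both fixes together, here is a minimal sketch of the corrected second block (same Python 2 / urllib2 setup and imports as the question; the Mozilla/5.0 string is just a common placeholder User-Agent):

headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request("http://cricfree.tv/football-live-stream", None, headers)
resp = urllib2.urlopen(req)
soup = BeautifulSoup(resp, from_encoding=resp.info().getparam('charset'))

base_url = "http://cricfree.tv"  # this site's own base, not idimsports.eu
links = soup.find_all('a', href=re.compile('' + user_input))
if len(links) == 0:
    print "No Streams Available"
else:
    for link in links:
        print urljoin(base_url, link['href'])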

Trying to scrape other category with beautifulsoup

This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/

This is my code. As you can see, I don't know how to fill both of the {} placeholders in the url variable, because the four categories I want to scrape produce different query strings (the one for Services especially). The comments above the url variable show the cate value for each of the four categories when clicked. I appreciate any help, thank you!
from bs4 import BeautifulSoup
import requests

def parse():
    cate = ["Service", "Food & Beverage", "Fashion & Accessories", "Electronics & Technology"]
    # cate=Food+%26+Beverage
    # cate=Electronics+%26+Technology
    # cate=Fashion+%26+Accessories
    # cate=Services
    url = "https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
    for cat in cate:
        for page in range(1, 14):
            print(page)
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            for link in soup.find_all('div', class_='entry-content'):
                try:
                    shops = soup.find_all('div', class_="col-9")
                    names = soup.find_all('tr', class_="clickable")
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ', '')
                        desc = k.text.replace(' ', '')
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)
            next_button = soup.select_one('.PagedList-skipToNext a')
            if next_button:
                url = next_button.get('href')
            else:
                break

parse()
Use the params argument of your request and avoid having to manage escape characters (like %26) yourself:
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'
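Note that requests URL-encodes the parameter values for you, which is why cate can hold the literal string 'Electronics & Technology' while resp.url shows cate=Electronics+%26+Technology. For the "# your code here" part, a minimal sketch reusing the selectors from the question (the col-9 and clickable class names come from the question itself and may have changed on the live site):

# Sketch only: selectors are taken from the question and may be stale.
shops = soup.find_all('div', class_='col-9')
names = soup.find_all('tr', class_='clickable')
for n, k in zip(names, shops):
    name = n.find_all('td')[1].text.strip()
    desc = k.text.strip()
    print(name, '-', desc)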

Why is my scraper showing 'none' as the output?

from bs4 import BeautifulSoup
import requests

url = 'https://www.flipkart.com'
data = {
    'q': 'laptops'
}
html_code = requests.post(url, data=data).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')
You can try this code:
import requests
from bs4 import BeautifulSoup

to_search = 'laptops'
url = ('https://www.flipkart.com/search?q={}&otracker=search&otracker1=search'
       '&marketplace=FLIPKART&as-show=on&as=off').format(to_search)
html_code = requests.get(url).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')
Can you please check again? Firstly, the URL 'https://www.flipkart.com' actually takes a 'GET' request, not 'POST'. Secondly, the class _3wU53n which you have provided is not present in the HTML DOM. Have a look at the screenshot below for the 'GET' request and the absence of class _3wU53n in the HTML DOM.
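Since those obfuscated class names change whenever the site redeploys, a defensive sketch (the /search endpoint and q parameter come from the answer above; everything else is an assumption to verify) is to issue the GET request with a browser-like User-Agent and inspect what actually comes back before hard-coding a selector:

# Sketch: GET search request with a browser-like User-Agent.
# _3wU53n-style class names are auto-generated and rotate, so any
# hard-coded selector is a placeholder to re-check against the live HTML.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('https://www.flipkart.com/search',
                    params={'q': 'laptops'}, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')
print(resp.status_code)
print(soup.title)  # confirm it's a results page, not a block page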

In Python 3, how can I use the .append function to add a string to scraped links?

Thanks to stackoverflow.com I was able to write a program that scrapes web links from any given web page. However, I need it to concatenate the home URL to any relative link that it comes across. (Example: "http://www.google.com/sitemap" is okay. But just "/sitemap" by itself is not okay.)
In the following code,
from bs4 import BeautifulSoup as mySoup
from urllib.parse import urljoin as myJoin
from urllib.request import urlopen as myRequest

base_url = "https://www.census.gov/programs-surveys/popest.html"
html_page = myRequest(base_url)
raw_html = html_page.read()
page_soup = mySoup(raw_html, "html.parser")
html_page.close()

f = open("census4-3.csv", "w")
all_links = page_soup.find_all('a', href=True)

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        full_url = myJoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, base_url)
for link in cleaned_links:
    f.write(str(link) + '\n')
f.close()
print("The CSV file is saved to your computer.")
print("The CSV file is saved to your computer.")
how and where would I add something like this:
.append("http://www.google.com")
You should save your base URL as base_url = 'https://www.census.gov'.
Call the request like this:

html_page = myRequest(base_url + '/programs-surveys/popest.html')

When you want to get any full_url, just do this:

full_url = base_url + link
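For what it's worth, the urljoin the question already imports handles both cases on its own: it resolves relative links against the base and leaves absolute links untouched. A quick demonstration of that behavior:

from urllib.parse import urljoin

base = 'https://www.census.gov/programs-surveys/popest.html'
print(urljoin(base, '/sitemap'))                  # https://www.census.gov/sitemap
print(urljoin(base, 'https://example.com/page'))  # absolute URLs pass through unchanged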

Convert a result in the console into a list - Python

I want to web scrape a list of URLs from a web site and then open them one by one.
I can get the list of all the URLs, but when I try to turn it into a list things go wrong.
When I print the list, instead of getting [url1, url2, ...] once, I get the growing list printed on a different line for each iteration, like this:

[url1, url2, url3]
[url1, url2, url3, url4]
[url1, url2, url3, url4, url5]

Find my script below:
from selenium import webdriver
from bs4 import BeautifulSoup as soup  # imports added; the question aliases BeautifulSoup as soup
import time

driver = webdriver.Chrome()
my_url = "https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
driver.get(my_url)
time.sleep(3)

content = driver.page_source.encode('utf-8').strip()
page_soup = soup(content, "html.parser")

links = []
for link in page_soup.find_all('a', href=True):
    url = link['href']
    ai = str(url)
    links.append(ai)
    print(links)  # printing inside the loop shows the growing list on each iteration
I have rewritten your code a little. First you need to load and scrape the main page to get all the links from the href attributes. After that, just use the scraped URLs in a loop to get the next pages.
Also, there is some junk in the href values that isn't a URL, so you have to clean that out first.
I prefer requests for doing the GET:
http://docs.python-requests.org/en/master/
I hope it helps.
from bs4 import BeautifulSoup
import requests

def main():
    links = []
    url = "https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
    web_page = requests.get(url)
    soup = BeautifulSoup(web_page.content, "html.parser")
    a_tags = soup.find_all('a', href=True)
    for a in a_tags:
        links.append(a.get("href"))
    print(links)  # just to demonstrate that links are there

    cleaned_list = []
    for link in links:
        if "http" in link:
            cleaned_list.append(link)
    print(cleaned_list)
    return cleaned_list

def load_pages_from_links(urls):
    user_agent = {'User-agent': 'Mozilla/5.0'}
    links = urls
    downloaded_pages = {}
    if len(links) == 0:
        return "There are no links."
    else:
        for nr, link in enumerate(links):
            web_page = requests.get(link, headers=user_agent)
            downloaded_pages[nr] = web_page.content
    print(downloaded_pages)

if __name__ == "__main__":
    links = main()
    load_pages_from_links(links)
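One small note on the code above: load_pages_from_links only prints downloaded_pages. If you want to use the pages afterwards, a minimal variation (not part of the original answer) is to return the dict instead:

def load_pages_from_links(urls):
    user_agent = {'User-agent': 'Mozilla/5.0'}
    downloaded_pages = {}
    for nr, link in enumerate(urls):
        web_page = requests.get(link, headers=user_agent)
        downloaded_pages[nr] = web_page.content
    return downloaded_pages  # return instead of print, so the caller can work with the pages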

How to grab the next page URL from pagination

I'm unable to fetch the URL for the next page; it throws a traceback error. Basically I want to grab "/browse-movies?page=2".
from bs4 import BeautifulSoup
import requests
import re

url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('ul', 'tsc_pagination')[0]
for item in items:
    print item
You could use range(1, 300) to iterate over all pages:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'browse-movie-wrap')
    for item in items:
        for val in item.find_all('div', 'browse-movie-bottom'):
            title = item.find_all('a', 'browse-movie-title')[0].text
            year = item.find_all('div', 'browse-movie-year')[0].text
            for val in item.find_all('a', 'browse-movie-link'):
                try:
                    rating = val.find_all('h4')[0].text
                    genre = val.find_all('h4')[1].text
                except:
                    pass
            print year, rating, genre, title
P.S. You might want to add time.sleep(1) to slow down a little, in case they block your IP for scraping their webpages too aggressively.
Edit:
To grab the next-page URL, you could use a regular expression:

import re
next_page = soup.find('a', text=re.compile(r'.*Next.*'))
print next_page['href']

What it does is look for an a tag whose text matches the regular expression '.*Next.*'.
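Putting that together, here is a sketch of a loop that follows the Next link until it disappears, instead of hard-coding a page range (it assumes the Next anchor keeps a relative href like "/browse-movies?page=2", as in the question, and uses Python 2 to match the answer above):

# Sketch: follow the "Next" link until there isn't one.
from urlparse import urljoin
import re
import time

import requests
from bs4 import BeautifulSoup

url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
while url:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # ... parse the movie items here ...
    next_page = soup.find('a', text=re.compile(r'.*Next.*'))
    url = urljoin(url, next_page['href']) if next_page else None
    time.sleep(1)  # be polite, as suggested above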
urls = ["https://yts.ag/browse-movies?page={}".format(i) for i in range(1, 10)] # make a url list and iterate over it
for url in urls:
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
# your code here
print year, rating, genre, title
Make a URL list and iterate over it. You can change the range.
