How should I scrape these images without errors?

How should I scrape these images without errors? - python

I'm trying to scrape the images (or the image links) from this forum thread (http://www.xossip.com/showthread.php?t=1384077). I've tried Beautiful Soup 4, and here is the code I tried:
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
page = 1
while page <= max_pages:
url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
sourcecode= requests.get(url)
plaintext = sourcecode.text
soup = BeautifulSoup(plaintext)
for link in soup.findAll('a',{'class': 'alt1'}):
src = link.get('src')
print(src)
page += 1
spider(1)
How should I correct it so that I get links of images like pzy.be/example ?

Okay, so I did this by getting all of the #post_message_* divs and then getting the images from each of those.
import requests
from bs4 import BeautifulSoup
def spider(max_pages):
    """Print the absolute image URLs found in the posts of each thread page.

    Pages 1 through ``max_pages`` (inclusive) of the thread are fetched. For
    every ``div`` whose id starts with ``post_message_``, the ``src`` of each
    contained ``<img>`` is printed when it starts with ``http``.
    """
    for page in range(1, max_pages + 1):
        url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
        response = requests.get(url)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        posts = soup.find_all('div', id=lambda d: d and d.startswith('post_message_'))
        for post in posts:
            # A post may hold zero or many images: find('img')['src'] raised
            # TypeError on image-less posts and ignored every image after
            # the first one.
            for img in post.find_all('img'):
                src = img.get('src')
                # Skip sources that are missing or not absolute
                # (it could be a smilie or something like that).
                if src and src.startswith('http'):
                    print(src)
spider(1)

The simplest way is to just request each page and filter the img tags:
from bs4 import BeautifulSoup
from requests import get
import re
def get_wp():
    """Yield the ``src`` of every pzy.be JPEG ``<img>`` on the thread pages.

    Each page is requested in turn and its ``<img>`` tags are filtered by a
    regular expression applied to their ``src`` attribute.
    """
    start_url = "http://www.xossip.com/showthread.php?t=1384077&page={}"
    # Compiled once, outside the loop. The dots in the host name and the file
    # extension are escaped so they match a literal '.' rather than any
    # character.
    jpg_src = re.compile(r"http://pzy\.be.*\.jpg")
    # NOTE(review): range(73) requests page=0 through page=72, exactly as
    # before; confirm against the thread's real page count and numbering.
    for page in range(73):
        response = get(start_url.format(page))
        soup = BeautifulSoup(response.content, "html.parser")
        for img in soup.find_all("img", src=jpg_src):
            yield img["src"]

Related

Web Image Scraping not giving any output

I am trying to scrape all the photos in the URL below, but this code doesn't give any output, why?
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.airbnb.co.uk/s/Ljubljana--Slovenia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Ljubljana%2C%20Slovenia&place_id=ChIJ0YaYlvUxZUcRIOw_ghz4AAQ&checkin=2020-11-01&checkout=2020-11-08&source=structured_search_input_header&search_type=autocomplete_click'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
print(images)
for image in images:
name = image['alt']
link = image['src']
print(name, link)

import os

from bs4 import BeautifulSoup
import requests

# Replace this with the website's URL
URL = "put your URL here"
# Some sites answer differently (or not at all) to the default client
# User-Agent, so a browser-like one is sent.
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
print(getURL.status_code)
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')

# Resolve every image source against the page URL so relative paths become
# absolute. Tags without a src are skipped: a missing src would otherwise
# resolve to the page URL itself and the loop below would save the HTML page
# as an "image".
resolvedURLs = []
for image in images:
    src = image.get('src')
    if src:
        resolvedURLs.append(requests.compat.urljoin(URL, src))

# open() does not create missing directories, so make sure the target exists.
os.makedirs('images', exist_ok=True)
for image in resolvedURLs:
    filename = image.split('/')[-1]
    if not filename:
        # The URL ends with '/', so there is no file name to save under.
        continue
    webs = requests.get(image)
    # The context manager closes the file even if the write fails.
    with open('images/' + filename, 'wb') as out:
        out.write(webs.content)

Scrape url list from Reelgood.com

Hi, I'm trying to build a scraper (in Python) for the website ReelGood.com.
Now I found this topic too, and I figured out how to scrape the URL from the movie page, but what I can't seem to figure out is why this script won't work:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests
URL = "https://reelgood.com/movies/source/netflix"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
f = open("C:/Downloaders/test/Scrape/test1.txt", "w")
for link in soup.select('a[href*="https://reelgood.com/movie/"]'):
data = link.get('href')
f.write(data)
f.write("\n")
f.close()
EDIT
The end goal is to export a txt file containing all links to the movie page on reelgood.com.
UPDATE: 1
if I change the script to this:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests
URL = "https://reelgood.com/movies/source/netflix"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
f = open("C:/Downloaders/test/Scrape/test1.txt", "w")
for link in soup.find_all("a", href=True):
data = link.get('href')
f.write(data)
f.write("\n")
f.close()
I do get the following output in test1.txt:
/ /tv /tv/curated/trending-picks /tv/browse/new-tv-on-your-sources /tv/roulette/netflix /tv?filter-availability=onSources /tv/source/free /tv/source/netflix /tv/source/amazon /tv/source/hulu /tv/curated/2019-emmy-nominees /tv/genre/action-and-adventure /tv/genre/animation /tv/genre/anime /tv/genre/comedy /tv/genre/crime /tv/genre/documentary /tv/genre/drama /tv/genre/food /tv/genre/family /tv/genre/fantasy /tv/curated/imdbs-best-rated-tv /tv/genre/lgbtq /tv/genre/mystery /tv/genre/reality /tv/genre/science-fiction /tv /movies /movies/browse/popular-movies /movies/browse/recent-movies-on-your-sources /movies/roulette/netflix /movies?filter-availability=onSources /movies/source/free /movies/source/netflix /movies/source/amazon /movies/source/hulu /movies/genre/action-and-adventure /movies/genre/animation /movies/genre/comedy /movies/genre/crime /movies/genre/documentary /movies/genre/drama /movies/genre/family /movies/genre/fantasy /movies/genre/horror /movies/curated/imdbs-best-rated-movies /movies/genre/lgbtq /movies/genre/mystery /movies/genre/romance /movies/genre/science-fiction /movies/genre/thriller /movies /new /new?availability=onSources /new/netflix /new/amazon /new/hulu /new/hbo /new/disney_plus /coming?availability=onSources /coming/netflix /coming/amazon /coming/hulu /coming/hbo /coming/disney_plus /leaving?availability=onSources /leaving/netflix /leaving/amazon /leaving/hulu /leaving/hbo /new /login /login /signup /movies/source/netflix /movies/genre/action-and-adventure/on-netflix /movies/genre/animation/on-netflix /movies/genre/anime/on-netflix /movies/genre/biography/on-netflix /movies/genre/children/on-netflix /movies/genre/comedy/on-netflix /movies/genre/crime/on-netflix /movies/genre/cult/on-netflix /movies/genre/documentary/on-netflix /movies/genre/drama/on-netflix /movies/genre/family/on-netflix /movies/genre/fantasy/on-netflix /movies/genre/food/on-netflix /movies/genre/history/on-netflix /movies/genre/horror/on-netflix 
/movies/genre/lgbtq/on-netflix /movies/genre/musical/on-netflix /movies/genre/mystery/on-netflix /movies/genre/romance/on-netflix /movies/genre/science-fiction/on-netflix /movies/genre/sport/on-netflix /movies/genre/stand-up-and-talk/on-netflix /movies/genre/thriller/on-netflix /movies/list/africa/on-netflix /movies/list/adaptation/on-netflix /movies/list/alien/on-netflix /movies/list/animal/on-netflix /movies/list/apocalypse/on-netflix /movies/list/baseball/on-netflix /movies/list/based-on-novel/on-netflix /movies/list/true-story/on-netflix /movies/list/bollywood/on-netflix /movies/list/boxing/on-netflix /movies/list/british-humour/on-netflix /movies/list/car/on-netflix /movies/list/cartoon/on-netflix /movies/list/christmas/on-netflix /movies/list/classic/on-netflix /movies/list/college/on-netflix /movies/list/based-on-comic/on-netflix /movies/list/coming-of-age/on-netflix /movies/list/dance/on-netflix /movies/list/dark-comedy/on-netflix /movies/list/dating/on-netflix /movies/list/disaster/on-netflix /movies/list/disney/on-netflix /movies/list/doctor/on-netflix /movies/list/dog/on-netflix /movies/list/drug/on-netflix /movies/list/dystopia/on-netflix /movies/list/egypt/on-netflix /movies/list/escape/on-netflix /movies/list/fashion/on-netflix /movies/list/feel-good/on-netflix /movies/list/woman-director/on-netflix /movies/list/fighting/on-netflix /movies/list/friendship/on-netflix /movies/list/futuristic/on-netflix /movies/list/gang/on-netflix /movies/list/gangster/on-netflix /movies/list/genius/on-netflix /movies/list/ghost/on-netflix /movies/list/golf/on-netflix /movies/list/gymnast/on-netflix /movies/list/heroism/on-netflix /movies/list/high-school/on-netflix /movies/list/holiday/on-netflix /movies/list/hunting/on-netflix /movies/list/imagination/on-netflix /movies/list/jungle/on-netflix /movies/list/kidnapping/on-netflix /movies/list/magic/on-netflix /movies/list/martial-arts/on-netflix /movies/list/mature/on-netflix /movies/list/medieval/on-netflix 
/movies/list/military/on-netflix /movies/list/monster/on-netflix /movies/list/music/on-netflix /movies/list/new-york/on-netflix /movies/list/paris/on-netflix /movies/list/parody/on-netflix /movies/list/pet/on-netflix /movies/list/police/on-netflix /movies/list/political/on-netflix /movies/list/princess/on-netflix /movies/list/prison/on-netflix /movies/list/psychology/on-netflix /movies/list/racing/on-netflix /movies/list/religion/on-netflix /movies/list/revenge/on-netflix /movies/list/river/on-netflix /movies/list/robot/on-netflix /movies/list/rome/on-netflix /movies/list/royalty/on-netflix /movies/list/science/on-netflix /movies/list/serial-killer/on-netflix /movies/list/short/on-netflix /movies/list/singing/on-netflix /movies/list/space/on-netflix /movies/list/sports/on-netflix /movies/list/spy/on-netflix /movies/list/superhero/on-netflix /movies/list/supernatural/on-netflix /movies/list/survival/on-netflix /movies/list/suspense/on-netflix /movies/list/tank/on-netflix /movies/list/teacher/on-netflix /movies/list/technology/on-netflix /movies/list/teen/on-netflix /movies/list/time-travel/on-netflix /movies/list/toy/on-netflix /movies/list/twins/on-netflix /movies/list/vampire/on-netflix /movies/list/games/on-netflix /movies/list/war/on-netflix /movies/list/world-war-ii/on-netflix /movies/list/wrestling/on-netflix /movies/list/zombie/on-netflix /source/netflix /movies/source/netflix /tv/source/netflix /movies /movies/source/free /movies /movies/source/netflix /movies/source/amazon /movies/source/hbo_max /movies/source/hbo /movies/source/showtime /movies/source/hulu /movies/source/fx_tveverywhere /movies/source/starz /movies/source/apple_tv_plus /movies/source/plex_free /movies/source/disney_plus /movies/source/peacock /movies/source/philo /movies/source/fubo_tv /movies/source/epix /movies/source/crunchyroll_premium /movies/source/dc_universe /movies/source/mubi /movies/source/discovery_plus /movies/source/amc_premiere /movies/source/amc /movies/source/britbox 
/movies/source/ifc /movies/source/youtube_premium /movies/source/shudder /movies/source/criterion_channel /movies/source/funimation /movies/source/fandor /movies/source/hoopla /movies/source/kanopy /movies/source/tubi_tv /movies/source/plutotv /movies/source/peacock_free /movies/source/vudu_free /movies/source/imdb_tv /movies/source/popcornflix /movies/source/crunchyroll_free /movies/source/crackle /movies/source/acorntv /movies/source/cinemax /movies/source/hallmark_movies_now /movies/source/sundance_tveverywhere /movies/source/syfy_tveverywhere /movies/source/tbs /movies/source/tnt /movies/source/bet_plus /movies/source/watch_tcm /movies/source/comedycentral_tveverywhere /movies/source/hallmark_everywhere /movies/source/lifetime_tveverywhere /movies/source/disneynow /movies/source/paramount_plus /movies/source/tvision /movie/the-intouchables-2011 /movie/the-intouchables-2011 /movie/the-irishman-2018 /movie/the-irishman-2018 /movie/marriage-story-2019 /movie/marriage-story-2019 /movie/dangal-2016 /movie/dangal-2016 /movie/the-invisible-guest-2017 /movie/the-invisible-guest-2017 /movie/scott-pilgrim-vs-the-world-2010 /movie/scott-pilgrim-vs-the-world-2010 /movie/david-attenborough-a-life-on-our-planet-2020 /movie/david-attenborough-a-life-on-our-planet-2020 /movie/a-silent-voice-2016 /movie/a-silent-voice-2016 /movie/13th-2016 /movie/13th-2016 /movie/the-dark-knight-2008 /movie/the-dark-knight-2008 /movie/icarus-2017 /movie/icarus-2017 /movie/roma-2018 /movie/roma-2018 /movie/black-mirror-bandersnatch-2018 /movie/black-mirror-bandersnatch-2018 /movie/inception-2010 /movie/inception-2010 /movie/the-two-popes-2019 /movie/the-two-popes-2019 /movie/the-trial-of-the-chicago-7-2020 /movie/the-trial-of-the-chicago-7-2020 /movie/to-all-the-boys-ive-loved-before-2018 /movie/to-all-the-boys-ive-loved-before-2018 /movie/pk-2014 /movie/pk-2014 /movie/the-social-dilemma-2020 /movie/the-social-dilemma-2020 /movie/fruitvale-station-2013 /movie/fruitvale-station-2013 
/movie/the-ballad-of-buster-scruggs-2018 /movie/the-ballad-of-buster-scruggs-2018 /movie/the-dawn-wall-2018 /movie/the-dawn-wall-2018 /movie/dolemite-is-my-name-2019 /movie/dolemite-is-my-name-2019 /movie/okja-2017 /movie/okja-2017 /movie/article-15-2019 /movie/article-15-2019 /movie/jim-andy-the-great-beyond-featuring-a-very-special-contractually-obligated-mention-of-tony-clifton-2017 /movie/jim-andy-the-great-beyond-featuring-a-very-special-contractually-obligated-mention-of-tony-clifton-2017 /movie/mudbound-2017 /movie/mudbound-2017 /movie/the-croods-2013 /movie/the-croods-2013 /movie/enola-holmes-2020 /movie/enola-holmes-2020 /movie/the-end-of-evangelion-1997 /movie/the-end-of-evangelion-1997 /movie/miss-americana-2020 /movie/miss-americana-2020 /movie/fyre-the-greatest-party-that-never-happened-2019 /movie/fyre-the-greatest-party-that-never-happened-2019 /movie/the-platform-2019 /movie/the-platform-2019 /movie/swades-we-the-people-2004 /movie/swades-we-the-people-2004 /movie/the-king-2019 /movie/the-king-2019 /movie/pieces-of-a-woman-2020 /movie/pieces-of-a-woman-2020 /movie/the-departed-2006 /movie/the-departed-2006 /movie/django-unchained-2012 /movie/django-unchained-2012 /movie/i-am-mother-2019 /movie/i-am-mother-2019 /movie/disclosure-trans-lives-on-screen-2020 /movie/disclosure-trans-lives-on-screen-2020 /movie/bo-burnham-make-happy-2016 /movie/bo-burnham-make-happy-2016 /movie/haider-2014 /movie/haider-2014 /movie/virunga-2014 /movie/virunga-2014 /movie/john-mulaney-kid-gorgeous-at-radio-city-2018 /movie/john-mulaney-kid-gorgeous-at-radio-city-2018 /movie/the-half-of-it-2020 /movie/the-half-of-it-2020 /movie/the-square-2013 /movie/the-square-2013 /movie/shutter-island-2010 /movie/shutter-island-2010 /movie/snowden-2016 /movie/snowden-2016 /movie/i-lost-my-body-2019 /movie/i-lost-my-body-2019 /movie/the-boy-who-harnessed-the-wind-2019 /movie/the-boy-who-harnessed-the-wind-2019 /movies/source/netflix?offset=50 
https://itunes.apple.com/us/app/reelgood-tv-guide-for-streaming/id1031391869 https://play.google.com/store/apps/details?id=com.reelgoodapp.reelgood&referrer=utm_source%3DReelgoodWebApp%26utm_medium%3DGetAppPopUp%26utm_term%3DGetAppPopUp%26utm_content%3DGetAppPopUp%26utm_campaign%3DGetAppPopUp%26anid%3Dadmob /roulette/netflix /swipe /browse/popular-movies /curated/popular-picks /source/free /all /source/netflix /new/netflix /source/hulu /new/hulu /source/hbo /new/hbo /source/amazon /new/amazon /sitemap /about /business/products/catalog/ /tos /privacy-policy https://blog.reelgood.com /careers /faq / https://itunes.apple.com/us/app/reelgood-tv-guide-for-streaming/id1031391869 https://play.google.com/store/apps/details?id=com.reelgoodapp.reelgood&referrer=utm_source%3DReelgoodWepApp%26utm_content%3DFooter%26utm_campaign%3DGetAndroidApp%26anid%3Dadmob
So this outputs all the links found, but they are missing the base URL.
UPDATE: 2
Movie URLs are as follows: <a href="/movie/the-movie-name-2021">

I would use a combination of attribute = value selectors to target the elements which have the full url in the content attribute
from bs4 import BeautifulSoup
import requests
# Listing page whose movie entries are marked up with itemprop attributes.
URL = "https://reelgood.com/movies/source/netflix"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
# Select the url-property elements nested inside each list item; the selected
# elements carry the address in their "content" attribute.
url_tags = soup.select('[itemprop=itemListElement] [itemprop=url]')
for movie_url in (tag['content'] for tag in url_tags):
    print(movie_url)

python crawl one page

I tried to extract links (href) which start with a specific word, but it returns an empty list even though the page source has a lot of links that satisfy the condition. I am definitely missing something; below is my code:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
soup = BeautifulSoup(page)
all_links = []
links = soup.find_all('a', pattern = re.compile(r'\w*first_word'))
# pattern = re.compile(r'\w*recette')
print(links)
for link in links:
all_links.append(link['href']) # Save href only, for example.
return all_links
for page_number in range(1, 63):
requete = requests.get ("https://www.website.com/pages/"+ "page".capitalize()+ "-" + str(page_number) + ".html")
page = requete.content
list_links = extract_href_page(page)
print(list_links)
for link in list_links:
print(link)

Try this:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
soup = BeautifulSoup(page)
all_links = []
links = soup.find_all('a', href=True)
# pattern = re.compile(r'\w*recette')
print(links)
for link in links:
if re.match(r"\w*first_word", link["href"], re.I):
all_links.append(link.get("href"))
...

Trying get titles from multiple pages in beautifulsoup

I am trying to get all blog titles from every page on a blog but so far am only able to produce output from the last page on that blog
from urllib.request import urlopen
from bs4 import BeautifulSoup
base_url = 'http://www.madame-love.com'
n = 10
for i in range(2, n+1):
html = urlopen(base_url + "/page/%d" % i)
page = BeautifulSoup(html.read(), 'html.parser')
for titles in page.findAll('h2'):
print(titles.string)

Try this. It will give you all the titles across different pages:
import requests
from bs4 import BeautifulSoup
# Template for the paginated blog URL; the placeholder is the page number.
base_url = 'http://www.madame-love.com/page/{}/'
# Highest page number to fetch; change this one value to cover more pages.
last_page = 4
# range() excludes its stop value, hence the +1 so that last_page itself is
# requested.
for page in range(1, last_page + 1):
    res = requests.get(base_url.format(page))
    soup = BeautifulSoup(res.text, 'lxml')
    # Each post title is the link inside an <h2 class="entry-title">.
    for titles in soup.select('h2.entry-title a'):
        print(titles.text)

Parsing a range of urls using Urllib2 or Beautifulsoup

I am trying to get data from a site that has the following form: "http://www.mysite.here?pageNo=2"
How do I get the html data from a consecutive range of pages using Urllib2 and/or BeautifulSoup? This code returns the html only for the first page.
import urllib2
from bs4 import BeautifulSoup
for x in range(1,450):
numb = str(x)
url = "http://www.mysite.here?pageNo="+numb
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
Print soup

On each iteration you rebind the same variable, soup, so only the last page survives.
You need storage for all the pages (a list of pages) and to append to it on each iteration.
import urllib2
from bs4 import BeautifulSoup
pages = []
for x in range(1, 450):
numb = str(x)
url = "http://www.mysite.here?pageNo=" + numb
page = urllib2.urlopen(url).read()
pages.append(BeautifulSoup(page, "html.parser"))
print pages

You can create a list, soups = [], and add each page's soup to it with soups.append(soup).
If you want just one soup object, you need to add the contents at each step, for instance the contents of each page's body:
# Start from an empty, well-formed document that the pages are merged into.
soup = BeautifulSoup("<html><body></body></html>", "html.parser")  # initialize soup
for x in range(1, 450):
    numb = str(x)
    url = "http://www.mysite.here?pageNo=" + numb
    page = urllib2.urlopen(url).read()
    tmpsoup = BeautifulSoup(page, "html.parser")
    if tmpsoup.body is None:
        # Nothing to merge from a page that has no <body>.
        continue
    # Iterate over a copy of the children: append() moves each element out of
    # tmpsoup.body, and mutating the list being iterated skips elements.
    for element in list(tmpsoup.body.contents):
        soup.body.append(element)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How should I scrape these images without errors? - python

Related

Web Image Scraping not giving any output

Scrape url list from Reelgood.com

python crawl one page

Trying get titles from multiple pages in beautifulsoup

Parsing a range of urls using Urllib2 or Beautifulsoup

Categories

Resources