Web Scraping loop structure issues - python

I'm currently writing some code to web scrape from AutoTrader as a practice project. I'm having trouble printing the results I need.
The desired output should be each car followed by its own specs, e.g.:
Car 1
Specs Car 1
Instead, I get a car followed by the specs of every car:
Car 1
Specs Car 1
Specs Car 2
Specs Car X
Car 2
Where in my looping structure am I going wrong?
from bs4 import BeautifulSoup
import requests
page_link = ("https://www.autotrader.co.uk/car-search?sort=price-asc&radius=1500&postcode=lu15jq&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&make=AUDI&model=A5&price-to=8500&year-from=2008&maximum-mileage=90000&transmission=Automatic&exclude-writeoff-categories=on")
LN = 0
r = requests.get(page_link)
c = r.content
soup = BeautifulSoup(c,"html.parser")
all = soup.find_all("h2",{"class":"listing-title title-wrap"})
all2 = soup.find_all('ul',{"class" :'listing-key-specs '})
The block above is fine. The block below prints the output:
LN = -1
ListTotal = len(all)
for item in all:
    if LN <= ListTotal:
        LN += 1
        print(item.find("a", {"class": "js-click-handler listing-fpa-link"}).text)
        for carspecs in all2:
            print(carspecs.text)
    else:
        break
Thanks

Because you're printing every carspec in all2 on every iteration of the outer loop:
all = ...
all2 = ...
for item in all:
    ...
    for carspecs in all2:
        # will print everything in all2 on each iteration of all
        print(carspecs.text)
I suspect you want
for item, specs in zip(all, all2):
    ...
    print(specs.text)
Just FYI, I cleaned up your code with better logic and names, got rid of superfluous stuff, and made it follow the Python style guide:
import requests
from bs4 import BeautifulSoup

page_link = ("https://www.autotrader.co.uk/car-search?sort=price-asc&"
             "radius=1500&postcode=lu15jq&onesearchad=Used&"
             "onesearchad=Nearly%20New&onesearchad=New&make=AUDI&model=A5"
             "&price-to=8500&year-from=2008&maximum-mileage=90000"
             "&transmission=Automatic&exclude-writeoff-categories=on")

request = requests.get(page_link)
conn = request.content
soup = BeautifulSoup(conn, "html.parser")

# don't overload the inbuilt `all`
cars = soup.find_all("h2", {"class": "listing-title title-wrap"})
cars_specs = soup.find_all("ul", {"class": "listing-key-specs "})

for car, specs in zip(cars, cars_specs):
    # your logic with regards to the `LN` variable did absolutely nothing
    print(car.find("a", {"class": "js-click-handler listing-fpa-link"}).text)
    print(specs.text)

Related

BeautifulSoup Python3 append multiple links output to single list

import requests
from bs4 import BeautifulSoup
import re

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span():
    for url in links:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")
        t1 = str(soup.findAll("span", id=re.compile('^msg_')))
        print(t1)
        t2 = [x for x in re.findall(r'\d+\.\d+', t1)]
        t2.sort(key=float, reverse=True)
        t3 = "https://bitcointalk.org/index.php?topic"
        for hn in t2:
            if len(hn) >= 9:
                hn = '{}={}'.format(t3, hn)
                print(hn)

get_span()
Hello!
My code iterates over the items in links, finds the spans with id=msg_, extracts all the numbers from those ids, and sorts them in descending order. The problem is that it processes the first item and prints its output, then the second item, and so on, so the output contains 3 separate lists, each sorted on its own. I want a single output with the results from all 3 links sorted together in one list.
You can use list.extend to add items to a list and then sort the final list before returning it.
For example:
import re
import requests
from bs4 import BeautifulSoup

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span(links):
    rv = []
    r = re.compile(r'\d{7,}\.\d+')
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        rv.extend(a['href'] for a in soup.select('span[id^="msg_"] > a') if r.search(a['href']))
    return sorted(rv, key=lambda k: float(r.search(k).group(0)), reverse=True)

all_links = get_span(links)

# print links on screen:
for link in all_links:
    print(link)
Prints:
https://bitcointalk.org/index.php?topic=5255494.0
https://bitcointalk.org/index.php?topic=5255416.0
https://bitcointalk.org/index.php?topic=5255389.0
https://bitcointalk.org/index.php?topic=5255376.0
https://bitcointalk.org/index.php?topic=5255316.0
https://bitcointalk.org/index.php?topic=5254720.0
https://bitcointalk.org/index.php?topic=5254480.0
https://bitcointalk.org/index.php?topic=5254448.0
https://bitcointalk.org/index.php?topic=5254287.0
https://bitcointalk.org/index.php?topic=5252504.0
https://bitcointalk.org/index.php?topic=5251621.0
https://bitcointalk.org/index.php?topic=5250998.0
https://bitcointalk.org/index.php?topic=5250388.0
https://bitcointalk.org/index.php?topic=5250185.0
https://bitcointalk.org/index.php?topic=5248406.0
https://bitcointalk.org/index.php?topic=5247112.0
... and so on.
EDIT: If you want to show the link text next to the URL, you can use this example:
import re
import requests
from bs4 import BeautifulSoup

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span(links):
    rv = []
    r = re.compile(r'\d{7,}\.\d+')
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        rv.extend((a['href'], a.text) for a in soup.select('span[id^="msg_"] > a') if r.search(a['href']))
    return sorted(rv, key=lambda k: float(r.search(k[0]).group(0)), reverse=True)

all_links = get_span(links)

# print links on screen:
for link, text in all_links:
    print('{} {}'.format(link, text))
Prints:
https://bitcointalk.org/index.php?topic=5255494.0 NUL Token - A new hyper-deflationary experiment! Airdrop!
https://bitcointalk.org/index.php?topic=5255416.0 KEEP NETWORK - A privacy layer for Ethereum
https://bitcointalk.org/index.php?topic=5255389.0 [ANN] ICO - OBLICHAIN | Blockchain technology at the service of creative genius
https://bitcointalk.org/index.php?topic=5255376.0 UniChain - The 4th Generation Blockchain Made For The Smart Society 5.0
https://bitcointalk.org/index.php?topic=5255316.0 INFINITE RICKS ! First Multiverse Cryptocurrency ! PoS 307%
https://bitcointalk.org/index.php?topic=5254720.0 [GMC] GameCredits - Unofficial & Unmoderated for Censored Posts.
https://bitcointalk.org/index.php?topic=5254480.0 [ANN] [BTCV] Bitcoin Vault - A higher standard in security
https://bitcointalk.org/index.php?topic=5254448.0 [ANN] Silvering (SLVG) token - New Silver Asset Backed Cryptocurrency
... and so on.

trying to loop through a list of urls and scrape each page for text

I'm having an issue. It loops through the list of URLs, but it's not adding the text content of each scraped page to the presults list.
I haven't gotten to the raw text processing yet; I'll probably make a question for that once I get there if I can't figure it out.
What is wrong here? The length of presults remains at 1 even though it seems to be looping through the list of URLs for the scrape...
Here's the part of the code I'm having an issue with:
counter = 0
for xa in range(0, len(qresults)):
    pageURL = qresults[xa].format()
    pageresp = requests.get(pageURL, headers=headers)
    if pageresp.status_code == 200:
        print(pageURL)
        psoup = BeautifulSoup(pageresp.content, 'html.parser')
        presults = []
        para = psoup.text
        presults.append(para)
        print(len(presults))
    else:
        print("Could not reach domain")
print(len(presults))
Your immediate problem is here:
presults = []
para = psoup.text
presults.append(para)
On every iteration of the for loop, you replace your existing presults list with a new empty list and add one item. On the next iteration, you wipe out the previous result again.
The initialization must be done only once, before the loop:
presults = []
for xa in range(0, len(qresults)):
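For illustration, here is a minimal sketch of the corrected loop under that change; qresults and headers are assumed to be the list of page URLs and the request headers defined elsewhere in the asker's script:
import requests
from bs4 import BeautifulSoup

presults = []  # initialize once, before the loop

for pageURL in qresults:  # qresults assumed to be a list of URL strings
    pageresp = requests.get(pageURL, headers=headers)  # headers assumed defined elsewhere
    if pageresp.status_code == 200:
        psoup = BeautifulSoup(pageresp.content, 'html.parser')
        presults.append(psoup.text)  # accumulate instead of overwriting
    else:
        print("Could not reach domain")

print(len(presults))  # now grows by one per successfully fetched page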
Ok, I don't even see you looping through any URLs here, but below is a generic example of how this kind of request can be achieved.
import requests
from bs4 import BeautifulSoup

# the page number is appended below, so the base URL ends with "page="
base_url = "http://www.privredni-imenik.com/pretraga?abcd=&keyword=&cities_id=0&category_id=0&sub_category_id=0&page="
current_page = 1

while current_page < 200:
    print(current_page)
    url = base_url + str(current_page)
    r = requests.get(url)
    zute_soup = BeautifulSoup(r.text, 'html.parser')
    firme = zute_soup.findAll('div', {'class': 'jobs-item'})
    for title in firme:
        title1 = title.findAll('h6')[0].text
        print(title1)
        adresa = title.findAll('div', {'class': 'description'})[0].text
        print(adresa)
        kontakt = title.findAll('div', {'class': 'description'})[1].text
        print(kontakt)
        print('\n')
        # combined record for one listing
        page_line = "{title1}\n{adresa}\n{kontakt}".format(
            title1=title1,
            adresa=adresa,
            kontakt=kontakt
        )
    current_page += 1

Scrape page with generator

I'm scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time

tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')

tld = "uz"
has_next = True
page = 0

def create_link(tld, page):
    if page == 0:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    else:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return link

def check_for_next(soup):
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if disabled_nav:
        if "Next" in disabled_nav:
            return False
        else:
            return True
    else:
        return True

def make_soup(link):
    html = jw.get_page(link)
    soup = BeautifulSoup(html, "lxml")
    return soup

def all_the_pages(counter):
    while True:
        link = create_link(tld, counter)
        soup = make_soup(link)
        if check_for_next(soup) == True:
            yield counter
        else:
            break
        counter += 1

def scrape_page(soup):
    table = soup.find('table', {'class': 'rankTable'})
    th = table.find('tbody')
    test = th.find_all("td")
    correct_cells = range(1, len(test), 3)
    for cell in correct_cells:
        #print test[cell]
        url = repr(test[cell])
        content = re.sub("<[^>]*>", "", url)
        sites.writerow([tld] + [content])

def main():
    for page in all_the_pages(0):
        print page
        link = create_link(tld, page)
        print link
        soup = make_soup(link)
        scrape_page(soup)

main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm going about it here make sense?
As I told you, you could use Selenium to programmatically click the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup

def page_count():
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        soup = BeautifulSoup(html)
        table = soup.find('table', {'class': 'rankTable'})
        if len(table.find_all('tr')) <= 1:
            return pages
        pages += 1
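Once the page count is known you no longer need the generator; here is a minimal sketch of how it could drive a plain loop, assuming the tld, create_link, make_soup and scrape_page from the question are defined (the exact off-by-one handling depends on how the site numbers its pages):
total_pages = page_count()

# the question's create_link numbers pages 0, 1, 2, ...
for page in range(total_pages):
    link = create_link(tld, page)
    soup = make_soup(link)
    scrape_page(soup)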

HTML Parsing gives no response

I'm trying to parse a web page, and this is my code:
from bs4 import BeautifulSoup
import urllib2

openurl = urllib2.urlopen("http://pastebin.com/archive/Python")
read = BeautifulSoup(openurl.read())
soup = BeautifulSoup(openurl)
x = soup.find('ul', {"class": "i_p0"})
sp = soup.findAll('a href')
for x in sp:
    print x
I really wish I could be more specific, but as the title says, it gives me no response. No errors, nothing.
First of all, omit the line read = BeautifulSoup(openurl.read()).
Also, the line x = soup.find('ul', {"class": "i_p0"}) doesn't actually make any difference, because you are reusing the x variable in the loop.
Also, soup.findAll('a href') doesn't find anything.
Also, instead of old-fashioned findAll(), there is a find_all() in BeautifulSoup4.
Here's the code with several alterations:
from bs4 import BeautifulSoup
import urllib2

openurl = urllib2.urlopen("http://pastebin.com/archive/Python")
soup = BeautifulSoup(openurl)
sp = soup.find_all('a')
for x in sp:
    print x['href']
This prints the values of the href attribute of all links on the page.
Hope that helps.
I altered a couple of lines in your code and I do get a response, not sure if that is what you want though.
Here:
openurl = urllib2.urlopen("http://pastebin.com/archive/Python")
soup = BeautifulSoup(openurl.read())  # This is what you need to use for selecting elements
# soup = BeautifulSoup(openurl)  # This is not needed
# x = soup.find('ul', {"class": "i_p0"})  # You don't seem to be making use of this either
sp = soup.findAll('a')
for x in sp:
    print x.get('href')  # This is to get the href
Hope this helps.

I need help web-scraping

So I wanted to scrape visualizations from visual.ly, but right now I do not understand how the "show more" button works. As of now, my code will get the image link, the text next to the image, and the link of the page. I was wondering how the "show more" button functions, because I was going to try to loop through using the number of pages. As of now I do not know how I would loop through each one individually. Any ideas on how I could loop through and get more images than they originally show you?
from BeautifulSoup import BeautifulSoup
import urllib2
import HTMLParser
import urllib, re

counter = 1
columnno = 1
parser = HTMLParser.HTMLParser()

soup = BeautifulSoup(urllib2.urlopen('http://visual.ly/?view=explore&type=static#v2_filter').read())
image = soup.findAll("div", attrs={'class': 'view-mode-wrapper'})

if columnno < 4:
    column = image[0].findAll("div", attrs={'class': 'v2_grid_column'})
    columnno += 1
else:
    column = image[0].findAll("div", attrs={'class': 'v2_grid_column last'})

visualizations = column[0].findAll("div", attrs={'class': '0 v2_grid_item viewmode-item'})
getImage = visualizations[0].find("a")
print counter
print getImage['href']

soup1 = BeautifulSoup(urllib2.urlopen(getImage['href']).read())
theImage = soup1.findAll("div", attrs={'class': 'ig-graphic-wrapper'})
text = soup1.findAll("div", attrs={'class': 'ig-content-right'})
getText = text[0].findAll("div", attrs={'class': 'ig-description right-section first'})
imageLink = theImage[0].find("a")
print imageLink['href']
print getText

for row in image:
    theImage = image[0].find("a")
    actually_download = False
    if actually_download:
        filename = link.split('/')[-1]
        urllib.urlretrieve(link, filename)
    counter += 1
You cannot use a urllib-and-parser combo here because the site uses JavaScript to load more content. To do this you would need a full browser emulator (with JavaScript support). I have never used Selenium before, but I have heard that it does this, and it has a Python binding.
However, I have found that it uses a very predictable form
http://visual.ly/?page=<page_number>
for its GET requests. Perhaps an easier way would be to go under
<div class="view-mode-wrapper">...</div>
to parse the data (using the above URL format). After all, AJAX requests must go to a location.
Then you could do
for i in xrange(<whatever>):
    url = r'http://visual.ly/?page={pagenum}'.format(pagenum=i)
    # do whatever you want from here
