Any help would be appreciated as I am new to python. I have created the below Web Crawler but it doesn't crawl all the pages, just 2 pages. What changes need to be made for it to crawl all the pages?
See def trade_spider(max_pages) loop and at the bottom i have trade_spider(18) which should loop all pages.
Thanks for your help.
import csv
import re
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline= "")
writer = csv.writer(f)
def trade_spider(max_pages):
page = 1
while page <= max_pages:
url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
href = "http://www.zoopla.co.uk" + link.get('href')
title = link.string
get_single_item_data(href)
page += 1
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
address = item_name.get_text(strip=True)
writer.writerow([address])
trade_spider(18)
Your code is working fine, it does crawl all the pages (though there are just 14 pages not 18). It seems like your trying to scrape street address , in that case the second function is unnecessary and is only making your crawler slow by calling requests.get() too many times. I've modified the code a little but this one is faster.
import csv
import re
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline="")
writer = csv.writer(f)
def trade_spider(max_pages):
page = 1
while page <= max_pages:
furl = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
# Changed the class' value
for link in soup.findAll('a', {'class': 'listing-results-address'}):
#href = "http://www.zoopla.co.uk" + link.get('href')
#title = link.string
#get_single_item_data(href)
address = link.get_text()
print (address) # Just to check it is working fine.
writer.writerow([address])
print (page)
page += 1
# Unnecessary code
'''def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
address = item_name.get_text(strip=True)
writer.writerow([address])'''
trade_spider(18)
Related
I'm new to python and have managed to get this far trying the HTML Parser, but I'm stuck on how to get pagination for the reviews at the bottom of the page to work for the site.
The URL is in the PasteBin code, I am leaving out the URL in this thread for privacy reasons.
Any help is much appreciated.
# Reviews Scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'EXAMPLE.COM'
# opening up connection, grabbing, the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# HTML Parsing
page_soup = soup(page_html, "html.parser")
# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})
filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)
# HTML Lookup Location per website and strips spacing
for container in reviews:
# score = container.div.div.span["data-score"]
score = container.findAll("span",{"data-score":True})
user_score = score[0].text.strip()
title_review = container.findAll("b",{"class":"jdgm-rev__title"})
user_title = title_review[0].text.strip()
content_review = container.findAll("div",{"class":"jdgm-rev__body"})
user_content = content_review[0].text.strip()
print("user_score:" + score[0]['data-score'])
print("user_title:" + user_title)
print("user_content:" + user_content)
f.write(score[0]['data-score'] + "," +user_title + "," +user_content + "\n")
f.close()
The page does an xhr GET request using a query string to get results. This query string has parameters for reviews per page and page number. You can make an initial request with what seems like the max reviews per page of 31, extract the html from the json returned then grab the page count; write a loop to run over all pages getting results. Example construct below:
import requests
from bs4 import BeautifulSoup as bs
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')]) #etc
Example dataframe to csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
authors = []
titles = []
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')]) #etc
headers = ['Author','Title']
df = pd.DataFrame(zip(authors,titles), columns = headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8',index = False )
I am trying to get the output from the python script into excel. The script works fine in Python, but when I try and do the import CSV and writerow rule it doesn't work. It says price not defined in writerow and how would I print multiple items. Any help would be appreciated.
import csv
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline = "")
writer = csv.writer(f)
def trade_spider(max_pages):
page = 1
while page <= max_pages:
url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
href = "http://www.zoopla.co.uk" + link.get('href')
title = link.string
get_single_item_data(href)
page +=1
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
address = item_name.string
print(item_name.get_text(strip=True))
for item_fame in soup.findAll('div', {'class' : 'listing-details-price text-price'}):
price = item_fame.string
print(item_fame.get_text(strip=True))
writer.writerow(price)
trade_spider(1)
The object price is not defined anywhere in your script outside of the function get_single_item_data. Outside of that function your code cannot recognize any object with that name. Also, get_single_item_data does not return anything from the BeautifulSoup object. It only prints it. You should rewrite your function to be something like this:
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
#create list to contain addresses
addresses = []
for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
address = item_name.string
#add each address to the list
addresses.append(address)
print(item_name.get_text(strip=True))
#create list for prices
prices = []
for item_fame in soup.findAll('div', {'class' : 'listing-details-price text-price'}):
price = item_fame.string
#add prices to list
prices.append(price)
print(item_fame.get_text(strip=True))
#alter the code to return the data structure you prefer.
return([addresses,prices])
I wrote simple code which scrape data from website but i'm struggling to save all rows to csv file. Finished script save only one row - it's last occurance in loop.
def get_single_item_data(item_url):
f= csv.writer(open("scrpe.csv", "wb"))
f.writerow(["Title", "Company", "Price_netto"])
source_code = requests.get(item_url)
soup = BeautifulSoup(source_code.content, "html.parser")
for item_name in soup.find_all('div', attrs={"id" :'main-container'}):
title = item_name.find('h1').text
prodDesc_class = item_name.find('div', class_='productDesc')
company = prodDesc_class.find('p').text
company = company.strip()
price_netto = item_name.find('div', class_="netto").text
price_netto = price_netto.strip()
#print title, company, ,price_netto
f.writerow([title.encode("utf-8"), company, price_netto, ])
Important is to save data to concurrent columns
#PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv
url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")
def main_spider(max_page):
page = 1
while page <= max_page:
url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
source_code = requests.get(url)
soup = BeautifulSoup(source_code.content, "html.parser")
for link in soup.find_all('article', class_='small-product'):
url = "http://www.selgros24.pl"
a = link.findAll('a')[0].get('href')
href = url + a
#print href
get_single_item_data(href)
page +=1
def get_single_item_data(item_url):
f= csv.writer(open("scrpe.csv", "wb"))
f.writerow(["Title", "Comapny", "Price_netto"])
source_code = requests.get(item_url)
soup = BeautifulSoup(source_code.content, "html.parser")
for item_name in soup.find_all('div', attrs={"id" :'main-container'}):
title = item_name.find('h1').text
prodDesc_class = item_name.find('div', class_='productDesc')
company = prodDesc_class.find('p').text
company = company.strip()
price_netto = item_name.find('div', class_="netto").text
price_netto = price_netto.strip()
print title, company, price_netto
f.writerow([title.encode("utf-8"), company, price_netto])
main_spider(1)
The problem is that you are opening the output file in get_single_item_data, and it is getting closed when that function returns and f goes out of scope.
You want to pass an open file in to get_single_item_data so multiple rows will be written.
This is the code that I wrote.
import requests
from bs4 import BeautifulSoup
def code_search(max_pages):
page = 1
while page <= max_pages:
url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('a', {'class': 'item-link'}):
href = link.get('href')
page += 1
code_search(2)
My pycharm version is pycharm-community-5.0.3 for mac.
It just says:
"Process finished with exit code 0"
But there should be some results if I have wrote the code accordingly...
Please help me out here!
You have no print statements - so the program doesn't output anything.
Add some print statements. For example, if you output the link, do this:
for link in soup.findAll('a', {'class': 'item-link'}):
href = link.get('href')
print(href)
page += 1
The answer depends on what you want to archieve with the web crawler. The first observation is that nothing is printed.
The following code prints the URL and all links found on the URL.
import requests
from bs4 import BeautifulSoup
def code_search(max_pages):
page = 1
while page <= max_pages:
url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
print("Current URL:", url)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('a', {'class': 'item-link'}):
href = link.get('href')
print("Found URL:", href)
page += 1
code_search(2)
It is also possible to let the method return all found URLs and then print the results:
import requests
from bs4 import BeautifulSoup
def code_search(max_pages):
page = 1
urls = []
while page <= max_pages:
url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('a', {'class': 'item-link'}):
href = link.get('href')
urls.append(href)
page += 1
return urls
print("Found URLs:", code_search(2))
The page I am trying to crawl is http://www.boxofficemojo.com/yearly/chart/?page=1&view=releasedate&view2=domestic&yr=2013&p=.htm. Specifically, I am focusing on this page right now: http://www.boxofficemojo.com/movies/?id=ironman3.htm.
For each of the movies on the first link, I want to get the Genre, Runtime, MPAA Rating, Foreign Gross, and Budget. I am having trouble getting this because there is no identifying tag on the information. What I have so far:
import requests
from bs4 import BeautifulSoup
from urllib2 import urlopen
def trade_spider(max_pages):
page = 1
while page <= max_pages:
url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.select('td > b > font > a[href^=/movies/?]'):
href = 'http://www.boxofficemojo.com' + link.get('href')
title = link.string
print title, href
get_single_item_data(href)
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
print soup.find_all("Genre: ")
for person in soup.select('td > font > a[href^=/people/]'):
print person.string
trade_spider(1)
So far, this retrieves all the titles of the movies from the original page, their link, and a list of the actors/people/directors etc. for each movie. Right now I am trying to get the Genre of the movie.
I tried to approach this in a similar way as the
"for person in soup.select('td > font > a[href^=/people/]'):
print person.string"
line, but this isn't a link, it is only text, so it is not working.
How can I get this data for each of the movies?
Find the Genre: text and get the next sibling:
soup.find(text="Genre: ").next_sibling.text
Demo:
In [1]: import requests
In [2]: from bs4 import BeautifulSoup
In [3]: response = requests.get("http://www.boxofficemojo.com/movies/?id=ironman3.htm")
In [4]: soup = BeautifulSoup(response.content)
In [5]: soup.find(text="Genre: ").next_sibling.text
Out[5]: u'Action / Adventure'