Scrape web sites with unique url (python) - python

I am currently working on a web scraping project, but I am having difficulties with the URL of the website, because it does not change when I go through the pages.
The website: https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail
My goal is to scrape all the buildings in the two pages.
The only way I can scrape the data is by using the inspect tool and copying the wrapper around all the ads.
This is my code:
from bs4 import BeautifulSoup
import requests
import csv
import string
import glob
# Grab the soup (content)
source = requests.get("https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
soup = BeautifulSoup(source.content, 'html.parser')

# Number of ads parsed so far. It must exist before the loop increments it;
# the original incremented it without ever defining it (NameError).
cnt = 0

# Loop through all the ads on the page
for ad in soup.find_all('div', {"data-id": "templateThumbnailItem"}):
    # Look for the price block inside *this* ad. Searching the whole page
    # (soup.find) is true as soon as any ad has a price, so it filters nothing.
    if not ad.find('div', {"class": "price"}):
        continue

    # The address span holds the street address first and the district second.
    address_parts = ad.find('span', {"class": "address"}).findChildren()
    address = address_parts[0].text.strip()
    district = address_parts[1].text.strip()

    # Get the type: only the first 7 characters of the category text are kept.
    typeBuilding = ad.find('span', {"class": "category"}).text.strip()
    typeBuilding = typeBuilding[0:7].strip()

    # Get the price: drop the currency sign and the non-breaking spaces used
    # as thousands separators before converting to an integer.
    price = ad.find('span', {"itemprop": "price"}).text
    price = int(price.replace('$', '').replace(u'\xa0', ''))

    cnt += 1
    print(f'Adresse: {address}, Quartier: {district}, Type: {typeBuilding}, Prix: {price}$')
Thank you for helping!

import requests
from bs4 import BeautifulSoup
import csv
def main(url):
    """Fetch two pages of listings from *url* and write them to ``data.csv``.

    *url* is the endpoint that receives a JSON body with a ``startPosition``
    key and answers with JSON whose ``d.Result.html`` member is an HTML
    fragment of listings. One CSV row is written per listing.
    """
    with requests.Session() as session:
        # The listing page is requested first on the same session; its
        # response is not used afterwards (presumably it sets up the cookies
        # the endpoint expects -- TODO confirm).
        session.get(
            "https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
        with open("data.csv", 'w', newline="", encoding="UTF-8") as out_file:
            writer = csv.writer(out_file)
            writer.writerow(["Address", "Quartier", "Type", "Price"])
            # Start positions 0 and 20, i.e. two requests.
            for start in range(0, 40, 20):
                payload = session.post(url, json={'startPosition': start}).json()
                fragment = BeautifulSoup(payload["d"]["Result"]["html"], 'html.parser')

                # Prices come from the "content" attribute and are rendered
                # with thousands separators.
                prices = []
                for tag in fragment.findAll("span", itemprop="price"):
                    prices.append(format(int(tag.get("content")), ',d'))

                containers = fragment.findAll("div", class_="location-container")
                kinds = [box.div.get_text(strip=True) for box in containers]
                addresses = [
                    box.select_one("span.address div").text for box in containers]
                districts = [
                    box.select_one("span.address div:nth-child(2)").text
                    for box in containers]

                writer.writerows(zip(addresses, districts, kinds, prices))


main("https://www.centris.ca/Mvc/Property/GetInscriptions")
Output: View Online

Related

Beautiful Soup IMDB

I am trying to get the names of the IMDb top movies, but I don't know how to extract the movie names specifically.
Here is source code
The Shawshank Redemption
and here is my code
import requests
from bs4 import BeautifulSoup as bs
# Question snippet: fetches the IMDb top chart and walks down to the <a> tags
# inside each "titleColumn" cell. The final print is the unanswered part.
# NOTE: the file is opened for writing but this snippet never writes to it
# or closes it.
file = open("text-txt-file.txt", "w")
imdburl1 = "https://www.imdb.com/chart/top"
r = requests.get(imdburl1)
soup = bs(r.content, "lxml")
# All tables whose class attribute is "chart full-width".
data = soup.find_all("table", {"class":"chart full-width"})
# Second-to-last child node of the first matching table.
movietable = (data[0].contents) [len(data[0].contents) - 2]
# Rebinds movietable to the list of <tr> rows found inside that node.
movietable = movietable.find_all("tr")
for i in movietable:
filmtitles = i.find_all("td", {"class":"titleColumn"})
for j in filmtitles:
# moviename is a list of <a> tags, not a string.
moviename = j.find_all("a")
print() # what to do ????
input()
Run a loop over moviename, then get the title attribute of each link:
# moviename comes from j.find_all("a") in the question's code; the value of
# each element's 'title' attribute is printed.
for title in moviename:
print(title.get('title')) # what to do ????
full code
import requests
from bs4 import BeautifulSoup as bs
# Print the 'title' attribute of every link in the "titleColumn" cells of the
# IMDb top chart.
#
# The output file is opened with a context manager so the handle is closed
# even if the scrape fails; the original left it open.
# NOTE(review): nothing is written to the file, exactly as in the original.
# If the titles are meant to be saved, write them inside the innermost loop.
with open("text-txt-file.txt", "w") as file:
    imdburl1 = "https://www.imdb.com/chart/top"
    r = requests.get(imdburl1)
    soup = bs(r.content, "lxml")
    data = soup.find_all("table", {"class": "chart full-width"})

    # Second-to-last child node of the first matching table, looked up once
    # rather than evaluating .contents twice.
    chart_children = data[0].contents
    movietable = chart_children[len(chart_children) - 2]

    for row in movietable.find_all("tr"):
        for cell in row.find_all("td", {"class": "titleColumn"}):
            for anchor in cell.find_all("a"):
                print(anchor.get('title'))

How to change this formula for python? Newbie to coding, any help is appreciated

Hi, I obtained this code online. It currently gets the URL and company information for the following tickers. Is there any way to update this code to show the sector and industry information instead of the URL and company information? I am new to coding, so I would appreciate any help :)
Code Below:
import bs4 as BeautifulSoup
from bs4 import SoupStrainer
import re
import urllib.request
import pandas as pd
import requests
symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']
headers = {'User-agent': 'Mozilla/5.0'}
mySymbols = {}

# Compiled once instead of on every loop iteration.
rxTitle = re.compile(r'\(.*$')        # everything from the first "(" onwards
rxYahoo = re.compile('yahoo', flags=re.IGNORECASE)

for s in symbols:
    vals = {}
    url = "https://finance.yahoo.com/quote/{}/profile?p={}".format(s, s)
    webpage = requests.get(url, headers=headers)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup.BeautifulSoup(webpage.content, "html.parser")
    coName = rxTitle.sub("", soup.find("title").get_text())

    for link in soup.find_all('a', href=True):
        # Keep only anchors that have a non-empty target and an empty title.
        # The original indexed the attributes directly and used a bare
        # `except: pass` to skip anchors lacking them, which also hid every
        # other error.
        if not link.get('target') or link.get('title') != "":
            continue
        # Skip links that point back to Yahoo.
        if rxYahoo.search(link['href']):
            continue

        # The original fetched the external link and ignored the response;
        # a failing request simply skipped the link. Keep that behaviour,
        # but catch only request errors and do not rebind `url` or `soup`.
        try:
            requests.get(link['href'], headers=headers)
        except requests.RequestException:
            continue

        vals = {"company": coName, "url": link['href']}
        print(s, vals)
        mySymbols[s] = vals
Looking at one of those pages I see the Sector is in a span with 'class'='Fw(600)' and 'data-reactid'=21 and the industry with data-reactid=25, so you could use
# Locate each span by its class plus a fixed data-reactid value, then print
# the node that follows the opening tag (.next), i.e. the span's content
# rather than the whole tag.
sector = soup.find('span', {'class':'Fw(600)','data-reactid': '21'})
print(sector.next)
industry = soup.find('span', {'class':'Fw(600)','data-reactid': '25'})
print(industry.next)
The sector.next gets the stuff within the span instead of returning the entire thing.
A better approach that looks for the 'Sector' and 'Industry' spans and returns the subsequent span is fully coded below:
import bs4 as BeautifulSoup
import requests
def get_tags(url):
    """Return the page title plus the sector and industry found at *url*.

    The page is fetched with the module-level ``headers``. Every ``<span>``
    whose text is exactly ``'Sector'`` or ``'Industry'`` is treated as a
    label, and the text of the span that follows it in document order is
    stored under that label.

    Returns a dict that always has a ``'title'`` key, and has ``'Sector'``
    and ``'Industry'`` keys only when the matching label was found.
    """
    webpage = requests.get(url, headers=headers)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup.BeautifulSoup(webpage.content, "html.parser")

    results = {'title': soup.find("title").get_text()}

    spans = soup.findAll('span')
    # Pair each span with its successor. Unlike indexing spans[i + 1], this
    # cannot run past the end when a label happens to be the last span.
    for label, value in zip(spans, spans[1:]):
        if label.text in ('Sector', 'Industry'):
            results[label.text] = value.text
    return results
# Request header sent with every profile fetch (read by get_tags).
headers = {'User-agent': 'Mozilla/5.0'}
symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']

# Print the scraped dict for each ticker's profile page.
for ticker in symbols:
    profile_url = "https://finance.yahoo.com/quote/{}/profile?p={}".format(ticker, ticker)
    print(get_tags(profile_url))

Review scraping form tripadvisor

I am new to web scraping in Python 3. I want to scrape the reviews of all the hotels in Dubai, but the problem is that I can only scrape the reviews of the hotel whose URL I specify. Can anyone show me how to get the reviews of all the hotels without explicitly giving the URL of each hotel?
import requests
from bs4 import BeautifulSoup
importurl = 'https://www.tripadvisor.com/Hotel_Review-g295424-d302778-Reviews-Roda_Al_Bustan_Dubai_Airport-Dubai_Emirate_of_Dubai.html'
r = requests.get(importurl)
soup = BeautifulSoup(r.content, "lxml")
resultsoup = soup.find_all("p", {"class": "partial_entry"})

# Extract the text once and reuse it for both printing and saving.
reviews = [review.get_text() for review in resultsoup]

for review_list in reviews:
    print(review_list)

# Save the reviews to a test text file locally.
# An explicit encoding keeps non-ASCII review text from failing on platforms
# whose default encoding cannot represent it, and each review goes on its own
# line instead of being run together with the next one.
with open('testreview.txt', 'w', encoding='utf-8') as fid:
    for review_list in reviews:
        fid.write(review_list + '\n')
You should find the index pages that list all the hotels, collect all the links into a list, and then loop over the URL list to get the comments.
import bs4, requests
# Index pages are addressed by an offset: 0, 30, ..., 510 (18 pages).
index_pages = ('http://www.tripadvisor.cn/Hotels-g295424-oa{}-Dubai_Emirate_of_Dubai-Hotels.html#ACCOM_OVERVIEW'.format(offset) for offset in range(0, 540, 30))

# Flat list of hotel links gathered from every index page.
urls = []
with requests.session() as s:
    for index in index_pages:
        r = s.get(index)
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        # extend, not append: append would add one nested list per index
        # page, so urls would hold 18 lists instead of the individual links.
        urls.extend(a.get('href') for a in soup.select('.property_title'))
out:
len(urls): 540

How can I make the output with pair of list : content in my python code?

I have been developing a Python web crawler for this website. I wrote two functions, which work well separately.
One is to collect the list of stocks and
Another is to collect the content data of each list.
I would like to make the output of my code with pairs of
"list#1/content#1",
"list#2/content#2",
"list#3/content#3",
What needs to be modified in my code in order to achieve this?
Thanks.
from bs4 import BeautifulSoup
import urllib.request
# Listing-page URL without the page number; str(i) is appended to it below.
CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
# Site root that fetch_post_content prepends to a post link.
BASE_PAGE = 'http://www.bobaedream.co.kr'
# Scrapes listing pages and pulls price, title, link and photo URL out of each
# row of the "cyber" table.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the final `return` is uncertain. It returns the single name lst_link, not a
# collection, so at most one link reaches the caller -- confirm the intent.
def fetch_post_list():
# range(20, 21) yields only the value 20, so a single page is requested.
for i in range(20,21):
URL = CAR_PAGE_TEMPLATE + str(i)
res = urllib.request.urlopen(URL)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='cyber')
#print ("Page#", i)
# 50 lists per each page
lists=table.find_all('tr', itemtype="http://schema.org/Article")
count=0
for lst in lists:
# A row is processed only when the <em> in its fourth cell has non-empty text.
if lst.find_all('td')[3].find('em').text:
lst_price=lst.find_all('td')[3].find('em').text
lst_title=lst.find_all('td')[1].find('a').text
lst_link = lst.find_all('td')[1].find('a')['href']
# The photo URL stays empty unless the first cell contains an <img>.
lst_photo_url=''
if lst.find_all('td')[0].find('img'):
lst_photo_url = lst.find_all('td')[0].find('img')['src']
count+=1
else: continue
#print('#',count, lst_title, lst_photo_url, lst_link, lst_price)
return lst_link
def fetch_post_content(lst_link):
    """Fetch one post page and walk its basic and seller information.

    *lst_link* is appended to ``BASE_PAGE`` to form the page URL. The first
    two ``<div>`` elements inside the ``rightarea`` block are read; every
    ``<dd>`` in them is expected to contain a ``span.t`` and a ``span.s``.

    Returns the ``<dd>`` elements of the first (basic information) div.
    Raises IndexError if a ``<dd>`` lacks either span.
    """
    URL = BASE_PAGE + lst_link
    res = urllib.request.urlopen(URL)
    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Basic Information
    table = soup.find('div', class_='rightarea')
    sections = table.find_all('div')

    # Number, Year, Mileage, Gas Type, Color, Accident
    content_table1 = sections[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        car_span_t = dd.find_all('span', {'class': 't'})[0]
        car_span_s = dd.find_all('span', {'class': 's'})[0]
        #print(car_span_t.text, ':', car_span_s.text)

    # Seller Information
    content_table2 = sections[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        # Read from dd2, the current seller entry. The original read from
        # `dd`, the variable left over from the loop above, so every pass
        # re-read the last basic-information entry.
        seller_span_t = dd2.find_all('span', {'class': 't'})[0]
        seller_span_s = dd2.find_all('span', {'class': 's'})[0]
        #print(seller_span_t.text, ':', seller_span_s.text)

    return dds

Python write of scraping data to csv file

I wrote some simple code which scrapes data from a website, but I'm struggling to save all the rows to a CSV file. The finished script saves only one row - the last occurrence in the loop.
def get_single_item_data(item_url):
    """Scrape one product page and append its row(s) to ``scrpe.csv``.

    For every ``div#main-container`` on the page, the title (``<h1>``), the
    company (first ``<p>`` inside ``div.productDesc``) and the net price
    (``div.netto``) are written as one CSV row.

    The file is opened in append mode, so rows from earlier calls are kept.
    The original reopened it in write mode on every call, which truncated
    the file and left only the last product. The header row is written only
    when the file does not exist yet or is empty.

    Written for Python 3: the file is opened in text mode with
    ``newline=""`` as the csv module requires, and the title is written as
    ``str`` rather than UTF-8 bytes.
    """
    import os  # local import: only needed for the empty-file check below

    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")

    rows = []
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        rows.append([title, company, price_netto])

    needs_header = (not os.path.exists("scrpe.csv")
                    or os.path.getsize("scrpe.csv") == 0)
    with open("scrpe.csv", "a", newline="", encoding="utf-8") as out_file:
        f = csv.writer(out_file)
        if needs_header:
            f.writerow(["Title", "Company", "Price_netto"])
        f.writerows(rows)
It is important to save the data in consecutive columns.
#PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv
url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")


def main_spider(max_page):
    """Crawl the category listing and write every product to ``scrpe.csv``.

    The CSV file is opened once here and its writer is passed to
    ``get_single_item_data``, so all products land in the same file. The
    original opened the file in write mode inside ``get_single_item_data``,
    truncating it for every product and leaving only the last row.

    *max_page* is the number of listing passes to make.
    """
    with open("scrpe.csv", "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["Title", "Company", "Price_netto"])

        page = 1
        while page <= max_page:
            # NOTE(review): the page number is never put into the URL, so
            # every pass fetches the same listing. The site's pagination
            # scheme is not visible here -- confirm before using
            # max_page > 1.
            source_code = requests.get(url_klocki)
            listing = BeautifulSoup(source_code.content, "html.parser")
            for link in listing.find_all('article', class_='small-product'):
                # Product links are joined onto the site root.
                href = "http://www.selgros24.pl" + link.findAll('a')[0].get('href')
                get_single_item_data(href, writer)
            page += 1


def get_single_item_data(item_url, writer=None):
    """Scrape one product page, print its fields and write them as CSV rows.

    For every ``div#main-container`` on the page, the title (``<h1>``), the
    company (first ``<p>`` inside ``div.productDesc``) and the net price
    (``div.netto``) are printed and written as one row.

    *writer* is a ``csv.writer`` owned by the caller. When it is omitted,
    the rows are appended to ``scrpe.csv`` instead, with a header row added
    only if the file is missing or empty.
    """
    source_code = requests.get(item_url)
    item_soup = BeautifulSoup(source_code.content, "html.parser")

    rows = []
    for item_name in item_soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        print(title, company, price_netto)
        rows.append([title, company, price_netto])

    if writer is not None:
        writer.writerows(rows)
        return

    # Standalone use: append so rows from earlier calls are not lost.
    import os  # local import: only needed on this fallback path
    needs_header = (not os.path.exists("scrpe.csv")
                    or os.path.getsize("scrpe.csv") == 0)
    with open("scrpe.csv", "a", newline="", encoding="utf-8") as out_file:
        fallback_writer = csv.writer(out_file)
        if needs_header:
            fallback_writer.writerow(["Title", "Company", "Price_netto"])
        fallback_writer.writerows(rows)


main_spider(1)
The problem is that you are opening the output file in get_single_item_data, and it is getting closed when that function returns and f goes out of scope.
You want to pass an open file in to get_single_item_data so multiple rows will be written.

Categories

Resources