Could someone tell me how to append all the data to my variable?

name_company = soup.find_all("h1")
name_data = []
for item in name_company:
    name_data.append(item.string)

Why, when I print(name_data), is only the last scraped h1 in my list?
Thanks!
EDIT:
Here's my simplified code:
def robot_crawl(max_pages):
    page = 1
    while page < max_pages:
        url = "http://tel.local.ch/en/q/Vaud%20(Canton)/imprimerie.html?page=" + str(page)
        get_url = requests.get(url)
        get_text = get_url.text
        # take the text of the request
        soup = BeautifulSoup(get_text, "html.parser")
        for link in soup.find_all('a', {'class': "details-entry-title-link"}):
            href = link.get('href')
            bot_get_data(href)
        page += 1

def bot_get_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name_company = soup.find_all("h1")
    name_data = []
    for item in name_company:
        name_data.append(item.string)
        print(item.string)  # text or string? don't know the diff
    excel_data_transfer(name_data)

def excel_data_transfer(dataname):
    workbook = xlsxwriter.Workbook('datasccraping3.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', dataname)
    workbook.close()
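A likely cause, going only by the simplified code: name_data is re-created inside bot_get_data for every company page, and excel_data_transfer() re-creates datasccraping3.xlsx on every call, so each page overwrites the previous one and only the last h1 survives. A minimal sketch of one way around it, collecting the names into a single shared list and writing the workbook exactly once at the end (same imports and excel_data_transfer as above; the extra name_data parameter is my addition):

def bot_get_data(item_url, name_data):
    soup = BeautifulSoup(requests.get(item_url).text, "html.parser")
    for item in soup.find_all("h1"):
        # .get_text() joins all nested text; .string is None when the tag has more than one child
        name_data.append(item.get_text(strip=True))

def robot_crawl(max_pages):
    name_data = []  # one shared list, filled page by page
    page = 1
    while page < max_pages:
        url = "http://tel.local.ch/en/q/Vaud%20(Canton)/imprimerie.html?page=" + str(page)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all('a', {'class': "details-entry-title-link"}):
            bot_get_data(link.get('href'), name_data)
        page += 1
    excel_data_transfer(name_data)  # write the .xlsx exactly once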
Please help. I want to check the CSV first for the "final link" built below in the code, and append to the CSV only if no match is found:
Thanks in advance
for i in range(0, 1):
    with open('plaid.csv', 'a') as f_object:
        writer_object = csv.writer(f_object)
        url = f'https://plaidonline.com/products?closeout=True&page={i}'
        r = requests.get(url=url, headers=headers).content
        soup = BeautifulSoup(r, 'lxml')
        product_block = soup.select('div', class_='col-xs-12 col-md-8 col-lg-9 ')
        for i in product_block:
            href = i.find_all(class_='tile-link', href=True)
            for link in href:
                link = link.get('href')
                final_link = 'https://plaidonline.com/' + link
                if final_link not in product_urls:
                    product_urls.append(final_link)
                    # print(final_link)
                    writer_object.writerow(final_link)
        f_object.close()
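One hedged way to do the "only append if not already in the CSV" part is to load the existing rows into a set before scraping and check each final_link against it. This is only a sketch: it assumes headers and the overall loop from the snippet above, skips the product_block step by selecting the tile-link anchors directly, and wraps final_link in a list for writerow (passing a bare string makes csv write one character per column):

import csv
import os
import requests
from bs4 import BeautifulSoup

# Collect whatever is already in plaid.csv so a re-run does not add duplicates.
existing_links = set()
if os.path.exists('plaid.csv'):
    with open('plaid.csv', newline='') as f:
        existing_links = {row[0] for row in csv.reader(f) if row}

with open('plaid.csv', 'a', newline='') as f_object:
    writer_object = csv.writer(f_object)
    for i in range(0, 1):
        url = f'https://plaidonline.com/products?closeout=True&page={i}'
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        for link in soup.find_all('a', class_='tile-link', href=True):
            final_link = 'https://plaidonline.com/' + link['href']
            if final_link not in existing_links:
                writer_object.writerow([final_link])  # note the list
                existing_links.add(final_link)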
I can't seem to get this to work. My script goes to a site and scrapes the data into my info variable, but when I try to pull the href out of a specific class I either get None or it just doesn't work, no matter what combinations I try. Where am I screwing up? When I scrape into my info variable, there is a class='business-name' with the href inside it.
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
info = soup.findAll('div', {'class': 'info'})
for each_business in info:
    # This is the spot that is broken. I can't make it work!
    yp_bus_url = each_business.get('class_', 'business-name')['href']
    print(yp_bus_url)
You can also do this:
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
With the change here (be sure to assign the list to whatever you want):
#info = soup.findAll('div', {'class': 'info'})
info = soup.select("[class~=business-name]")
[i.get('href') for i in info]
Returns:
['/new-orleans-la/mip/upperline-restaurant-526381149?lid=1001797484770',
'/new-orleans-la/mip/brunos-tavern-451091659?lid=451091659',
'/new-orleans-la/mip/lafittes-blacksmith-shop-bar-19195002?lid=19195002',
'/new-orleans-la/mip/johnny-whites-pub-grill-5198728?lid=5198728',
'/new-orleans-la/mip/chart-room-6924442?lid=6924442',
'/new-orleans-la/mip/golden-lantern-8517918?lid=8517918',
'/new-orleans-la/mip/ryans-irish-pub-inc-851820?lid=851820',
'/new-orleans-la/mip/d-b-a-2084747?lid=2084747',
'/new-orleans-la/mip/parlays-13663513?lid=13663513',
'/new-orleans-la/mip/apple-barrel-18379645?lid=18379645',
'/new-orleans-la/mip/snake-jakes-xmas-club-lounge-4531421?lid=4531421',
'/new-orleans-la/mip/port-of-call-394043?lid=394043',
'/new-orleans-la/mip/coops-place-14511722?lid=14511722',
'/new-orleans-la/mip/twi-ro-pa-466224645?lid=466224645',
'/new-orleans-la/mip/krazy-korner-11594425?lid=11594425',
'/new-orleans-la/mip/bourbon-o-480103567?lid=480103567',
'/new-orleans-la/mip/hi-ho-lounge-458821090?lid=458821090',.....]
I think this is what you need:
for each_business in info:
    yp_bus_url = each_business.find('a', {'class': 'business-name'}).get('href')
    print(yp_bus_url)
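One thing to watch with that one-liner: if an info block happens to have no business-name anchor, find() returns None and the .get('href') call raises AttributeError. A slightly more defensive variant of the same idea:

for each_business in info:
    link = each_business.find('a', {'class': 'business-name'})
    if link is not None:
        print(link.get('href'))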
The below code should work for you:
import requests
from bs4 import BeautifulSoup
count = 0
search_terms = "Bars"
location = "New Orleans, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search_terms, 'geo_location_terms': location}
page = requests.get(url, params=q)
url_link = page.url
page_num = str(count)
searched_page = url_link + '&page=' + str(count)
page = requests.get(searched_page)
soup = BeautifulSoup(page.text, 'html.parser')
info = soup.findAll('div', {'class': 'info'})
for each_business in info:
    # Your Fix here
    for a in each_business.find_all('a', href=True):
        print("Found the URL:", a['href'])
Any help would be appreciated, as I am new to Python. I have created the web crawler below, but it doesn't crawl all the pages, just 2 pages. What changes need to be made for it to crawl all the pages?
See the def trade_spider(max_pages) loop; at the bottom I have trade_spider(18), which should loop over all the pages.
Thanks for your help.
import csv
import re
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline= "")
writer = csv.writer(f)
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])
trade_spider(18)
Your code is working fine; it does crawl all the pages (though there are only 14 pages, not 18). It looks like you're trying to scrape the street addresses, in which case the second function is unnecessary and only slows your crawler down by calling requests.get() once per listing. I've modified the code a little, and this version is faster.
import csv
import re
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline="")
writer = csv.writer(f)
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        # Changed the class' value
        for link in soup.findAll('a', {'class': 'listing-results-address'}):
            #href = "http://www.zoopla.co.uk" + link.get('href')
            #title = link.string
            #get_single_item_data(href)
            address = link.get_text()
            print(address)  # Just to check it is working fine.
            writer.writerow([address])
        print(page)
        page += 1

# Unnecessary code
'''def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])'''

trade_spider(18)
I am trying to get the output from the Python script into Excel. The script works fine on its own, but when I add the csv import and the writerow call it doesn't work: it says price is not defined in writerow. Also, how would I print multiple items? Any help would be appreciated.
import csv
import requests
from bs4 import BeautifulSoup
f = open('dataoutput.csv','w', newline = "")
writer = csv.writer(f)
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        print(item_name.get_text(strip=True))
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        print(item_fame.get_text(strip=True))
writer.writerow(price)
trade_spider(1)
The object price is not defined anywhere in your script outside of the function get_single_item_data. Outside of that function your code cannot recognize any object with that name. Also, get_single_item_data does not return anything from the BeautifulSoup object. It only prints it. You should rewrite your function to be something like this:
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # create a list to contain the addresses
    addresses = []
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        # add each address to the list
        addresses.append(address)
        print(item_name.get_text(strip=True))
    # create a list for the prices
    prices = []
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        # add each price to the list
        prices.append(price)
        print(item_fame.get_text(strip=True))
    # alter the code to return the data structure you prefer
    return([addresses, prices])
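For the "how would I print multiple items" part, one possible way to consume that return value from the calling side is sketched below. It reuses writer and the loop from your trade_spider and assumes each address lines up with the price at the same index; zip pairs them up so each listing becomes one CSV row.

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
        soup = BeautifulSoup(requests.get(url).text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            addresses, prices = get_single_item_data(href)
            for address, price in zip(addresses, prices):
                writer.writerow([address, price])  # one row per address/price pair
        page += 1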
I wrote some simple code which scrapes data from a website, but I'm struggling to save all the rows to a CSV file. The finished script saves only one row, the last occurrence in the loop.
def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        #print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])
The important thing is to save the data into the corresponding columns.
@PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv
url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")
def main_spider(max_page):
    page = 1
    while page <= max_page:
        url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.content, "html.parser")
        for link in soup.find_all('article', class_='small-product'):
            url = "http://www.selgros24.pl"
            a = link.findAll('a')[0].get('href')
            href = url + a
            #print href
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])
main_spider(1)
The problem is that you re-open the output file inside get_single_item_data: every call opens scrpe.csv in "wb" mode, which truncates the file, so only the rows written by the last call survive, and the writer goes out of scope as soon as the function returns.
You want to open the file once and pass the open file (or the csv writer) into get_single_item_data so that every row ends up in the same file.
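A minimal sketch of that fix, with the file opened once and the writer handed down. It is written in Python 3 style here (text mode plus newline=""); your script looks like Python 2, where you would keep the "wb" mode and the .encode call:

import csv
import requests
from bs4 import BeautifulSoup

def get_single_item_data(item_url, writer):
    soup = BeautifulSoup(requests.get(item_url).content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        company = item_name.find('div', class_='productDesc').find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        writer.writerow([title, company, price_netto])  # appended to the file opened in main_spider

def main_spider(max_page):
    with open("scrpe.csv", "w", newline="") as out:  # opened (and truncated) exactly once
        writer = csv.writer(out)
        writer.writerow(["Title", "Company", "Price_netto"])
        for page in range(1, max_page + 1):
            url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
            soup = BeautifulSoup(requests.get(url).content, "html.parser")
            for link in soup.find_all('article', class_='small-product'):
                href = "http://www.selgros24.pl" + link.find('a').get('href')
                get_single_item_data(href, writer)

main_spider(1)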