python beautiful soup output into excel - python

I am trying to get the output from the Python script into Excel. The script works fine in Python, but when I add the import csv and writerow calls it doesn't work: it says price is not defined in writerow. Also, how would I write multiple items? Any help would be appreciated.
import csv
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv', 'w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        print(item_name.get_text(strip=True))
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        print(item_fame.get_text(strip=True))

writer.writerow(price)

trade_spider(1)

The object price is not defined anywhere in your script outside of the function get_single_item_data. Outside of that function your code cannot recognize any object with that name. Also, get_single_item_data does not return anything from the BeautifulSoup object. It only prints it. You should rewrite your function to be something like this:
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # create a list to contain the addresses
    addresses = []
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        # add each address to the list
        addresses.append(address)
        print(item_name.get_text(strip=True))
    # create a list for the prices
    prices = []
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        # add each price to the list
        prices.append(price)
        print(item_fame.get_text(strip=True))
    # alter the code to return the data structure you prefer
    return [addresses, prices]
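With the function returning the data, the caller does the writing. As a rough sketch (assuming each listing yields matching numbers of addresses and prices, and reusing the module-level writer from your script), the loop inside trade_spider could become:

for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
    href = "http://www.zoopla.co.uk" + link.get('href')
    addresses, prices = get_single_item_data(href)
    # write one CSV row per address/price pair
    for address, price in zip(addresses, prices):
        writer.writerow([address, price])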

Related

Why do I get output from one but not the other?

from bs4 import BeautifulSoup
import requests
import re

def getHTMLdocument(url):
    response = requests.get(url)
    return response.text

def correct_url(url1):
    if not url1.startswith('https://www.parliament.gov.sg'):
        url1 = f'https://www.parliament.gov.sg{url1}'
    return url1

url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []

while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(correct_url(link.get('href')))

for link in links:
    url = link
    member_info = 'mp-designation-wrap'
    member_info = 'mp-constituency-wrap'  # swapping in this class is what triggers the error
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    txt1 = soup.find('div', attrs={'class': member_info})
    textoutput = txt1.text
    print(textoutput)
    break
I'm trying to separate the different categories so I can save them separately. However, I only get output when using member_info = 'mp-designation-wrap'; when using 'mp-constituency-wrap' I get an AttributeError: 'NoneType' object has no attribute 'text'.
I don't understand why it gives different results, and it would be great if someone could help me understand why and point me in the right direction.
The reason you get this error is that the element you are trying to select does not exist on some of the pages, so you have to check for that before calling .text.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    text1 = e.text if (e := soup.find('div', attrs={'class': 'mp-designation-wrap'})) else None
    text2 = e.text if (e := soup.find('div', attrs={'class': 'mp-constituency-wrap'})) else None
    print(text2)
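Note that the := (walrus) operator requires Python 3.8 or newer. On older versions the same guard can be spelled out explicitly, for example:

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    designation = soup.find('div', attrs={'class': 'mp-designation-wrap'})
    constituency = soup.find('div', attrs={'class': 'mp-constituency-wrap'})
    # only call .text when the element was actually found
    text1 = designation.text if designation is not None else None
    text2 = constituency.text if constituency is not None else None
    print(text2)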

My BeautifulSoup spider only crawls 2 pages not all the pages

Any help would be appreciated as I am new to Python. I have created the below web crawler, but it doesn't crawl all the pages, just 2 pages. What changes need to be made for it to crawl all the pages?
See the def trade_spider(max_pages) loop; at the bottom I have trade_spider(18), which should loop over all the pages.
Thanks for your help.
import csv
import re
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv', 'w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])

trade_spider(18)
Your code is working fine; it does crawl all the pages (though there are just 14 pages, not 18). It seems like you're trying to scrape the street address, in which case the second function is unnecessary and only makes your crawler slow by calling requests.get() too many times. I've modified the code a little; this version is faster.
import csv
import re
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv', 'w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        # Changed the class' value
        for link in soup.findAll('a', {'class': 'listing-results-address'}):
            #href = "http://www.zoopla.co.uk" + link.get('href')
            #title = link.string
            #get_single_item_data(href)
            address = link.get_text()
            print(address)  # Just to check it is working fine.
            writer.writerow([address])
        print(page)
        page += 1

# Unnecessary code
'''def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])'''

trade_spider(18)
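One small caveat with both versions: the file f is opened at module level and never closed, so the last buffered rows may not make it into dataoutput.csv if the script exits abruptly. A minimal fix, assuming the script structure above stays the same, is to close the file after the crawl:

trade_spider(18)
f.close()  # flush and close the CSV file once the crawl is done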

Python write of scraping data to csv file

I wrote simple code which scrapes data from a website, but I'm struggling to save all the rows to a CSV file. The finished script saves only one row - the last occurrence in the loop.
def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        #print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])
The important part is to save the data to the corresponding columns.
@PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv

url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")

def main_spider(max_page):
    page = 1
    while page <= max_page:
        url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.content, "html.parser")
        for link in soup.find_all('article', class_='small-product'):
            url = "http://www.selgros24.pl"
            a = link.findAll('a')[0].get('href')
            href = url + a
            #print href
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])

main_spider(1)
The problem is that you are opening the output file inside get_single_item_data, so every call reopens (and truncates) scrpe.csv, and the file is closed when that function returns and f goes out of scope.
You want to pass an open file in to get_single_item_data so multiple rows will be written.
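A rough sketch of that idea, written for Python 3 and keeping the question's selectors; the hrefs list below is just a stand-in for the product links that main_spider collects:

import csv
import requests
from bs4 import BeautifulSoup

def get_single_item_data(item_url, writer):
    # the csv writer is created once by the caller and passed in,
    # so every call appends rows to the same open file
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        company = item_name.find('div', class_='productDesc').find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        writer.writerow([title, company, price_netto])

# open the file once, write the header once, then crawl
with open("scrpe.csv", "w", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["Title", "Company", "Price_netto"])
    for href in hrefs:  # hrefs: the product URLs gathered by main_spider (assumed)
        get_single_item_data(href, writer)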

Beautifulsoup append html data

Could someone tell me how to append all the data in my variable?
name_company = soup.find_all("h1")
name_data = []
for item in name_company:
    name_data.append(item.string)
Why, when I print(name_data), is there only the last h1 that got scraped into my list?
Thanks!
EDIT:
Here's my simplified code:
def robot_crawl(max_pages):
    page = 1
    while page < max_pages:
        url = "http://tel.local.ch/en/q/Vaud%20(Canton)/imprimerie.html?page=" + str(page)
        get_url = requests.get(url)
        get_text = get_url.text
        # take the text of the request
        soup = BeautifulSoup(get_text, "html.parser")
        for link in soup.find_all('a', {'class': "details-entry-title-link"}):
            href = link.get('href')
            bot_get_data(href)
        page += 1

def bot_get_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name_company = soup.find_all("h1")
    name_data = []
    for item in name_company:
        name_data.append(item.string)
        print(item.string)  # text or string? don't know the diff
    excel_data_transfer(name_data)

def excel_data_transfer(dataname):
    workbook = xlsxwriter.Workbook('datasccraping3.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', dataname)
    workbook.close()
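One likely reason only the last h1 shows up is that bot_get_data builds a fresh name_data list and rewrites datasccraping3.xlsx on every call, so each page overwrites the previous one. A minimal sketch of one way around that, collecting all the names first and writing the workbook once at the end (the names list and the return value are assumptions, not part of the original code):

import requests
import xlsxwriter
from bs4 import BeautifulSoup

def bot_get_data(item_url):
    soup = BeautifulSoup(requests.get(item_url).text, "html.parser")
    # return this page's h1 strings instead of writing them straight away
    return [item.string for item in soup.find_all("h1")]

def robot_crawl(max_pages):
    names = []  # accumulate results from every detail page here
    page = 1
    while page < max_pages:
        url = "http://tel.local.ch/en/q/Vaud%20(Canton)/imprimerie.html?page=" + str(page)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all('a', {'class': "details-entry-title-link"}):
            names.extend(bot_get_data(link.get('href')))
        page += 1
    excel_data_transfer(names)  # write the workbook once, after crawling

def excel_data_transfer(dataname):
    workbook = xlsxwriter.Workbook('datasccraping3.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', dataname)
    workbook.close()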

Why does my crawler with BeautifulSoup not show results?

This is the code that I wrote.
import requests
from bs4 import BeautifulSoup

def code_search(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'item-link'}):
            href = link.get('href')
        page += 1

code_search(2)
My PyCharm version is pycharm-community-5.0.3 for Mac.
It just says:
"Process finished with exit code 0"
But there should be some results if I have written the code correctly...
Please help me out here!
You have no print statements - so the program doesn't output anything.
Add some print statements. For example, to output each link, do this:
for link in soup.findAll('a', {'class': 'item-link'}):
    href = link.get('href')
    print(href)
page += 1
The answer depends on what you want to achieve with the web crawler. The first observation is that nothing is printed.
The following code prints the URL and all links found on the URL.
import requests
from bs4 import BeautifulSoup

def code_search(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
        print("Current URL:", url)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'item-link'}):
            href = link.get('href')
            print("Found URL:", href)
        page += 1

code_search(2)
It is also possible to let the method return all found URLs and then print the results:
import requests
from bs4 import BeautifulSoup

def code_search(max_pages):
    page = 1
    urls = []
    while page <= max_pages:
        url = 'http://kindai.ndl.go.jp/search/searchResult?searchWord=朝鲜&facetOpenedNodeIds=&featureCode=&viewRestrictedList=&pageNo=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'item-link'}):
            href = link.get('href')
            urls.append(href)
        page += 1
    return urls

print("Found URLs:", code_search(2))
