Parse csv and append if no match found - python

Please help: I want to check the CSV first for the "final link" built in the code below, and append to the CSV only if no match is found.
Thanks in advance
import csv
import requests
from bs4 import BeautifulSoup

# `headers` and `product_urls` are defined earlier in the script
for i in range(0, 1):
    with open('plaid.csv', 'a', newline='') as f_object:
        writer_object = csv.writer(f_object)
        url = f'https://plaidonline.com/products?closeout=True&page={i}'
        r = requests.get(url=url, headers=headers).content
        soup = BeautifulSoup(r, 'lxml')
        product_block = soup.find_all('div', class_='col-xs-12 col-md-8 col-lg-9 ')
        for block in product_block:
            href = block.find_all(class_='tile-link', href=True)
            for link in href:
                link = link.get('href')
                final_link = 'https://plaidonline.com/' + link
                if final_link not in product_urls:
                    product_urls.append(final_link)
                    # print(final_link)
                    writer_object.writerow([final_link])  # writerow expects a sequence of fields
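A possible approach (a sketch, not an answer from the thread): load whatever links are already in plaid.csv into a set before scraping, then write only links that are not in that set. The headers dict is assumed to exist as in the question, and the per-block lookup is simplified to a single find_all on the tile links.

import csv
import os
import requests
from bs4 import BeautifulSoup

# collect links already stored in the CSV (if the file exists yet)
existing_links = set()
if os.path.exists('plaid.csv'):
    with open('plaid.csv', newline='') as f:
        existing_links = {row[0] for row in csv.reader(f) if row}

with open('plaid.csv', 'a', newline='') as f_object:
    writer_object = csv.writer(f_object)
    for i in range(0, 1):
        url = f'https://plaidonline.com/products?closeout=True&page={i}'
        r = requests.get(url=url, headers=headers).content  # headers assumed defined as in the question
        soup = BeautifulSoup(r, 'lxml')
        for tile in soup.find_all(class_='tile-link', href=True):
            final_link = 'https://plaidonline.com/' + tile.get('href')
            if final_link not in existing_links:  # skip anything already in the CSV
                existing_links.add(final_link)
                writer_object.writerow([final_link])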

Related

Why do I get output from one but not the other?

from bs4 import BeautifulSoup
import requests
import re

def getHTMLdocument(url):
    response = requests.get(url)
    return response.text

def correct_url(url1):
    if not url1.startswith('https://www.parliament.gov.sg'):
        url1 = f'https://www.parliament.gov.sg{url1}'
    return url1

url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []

while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(correct_url(link.get('href')))

for link in links:
    url = link
    member_info = 'mp-designation-wrap'
    member_info = 'mp-constituency-wrap'  # <-- the line I switch to
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    txt1 = soup.find('div', attrs={'class': member_info})
    textoutput = txt1.text
    print(textoutput)
    break
I'm trying to separate the different categories so I can save them separately. However, I only get output when using member_info = 'mp-designation-wrap'; with 'mp-constituency-wrap' I get AttributeError: 'NoneType' object has no attribute 'text'.
I don't understand why it gives different results, and it would be great if someone could help me understand why and point me in the right direction.
The reason you get this error is that the element you are trying to select does not exist on some of the pages, so you have to check for that before calling .text.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    text1 = e.text if (e := soup.find('div', attrs={'class': 'mp-designation-wrap'})) else None
    text2 = e.text if (e := soup.find('div', attrs={'class': 'mp-constituency-wrap'})) else None
    print(text2)
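Note that the := (walrus) operator requires Python 3.8 or later. On older versions, an equivalent sketch of the same check:

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    designation = soup.find('div', attrs={'class': 'mp-designation-wrap'})
    constituency = soup.find('div', attrs={'class': 'mp-constituency-wrap'})
    text1 = designation.text if designation else None  # None when the div is missing
    text2 = constituency.text if constituency else None
    print(text2)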

Extract href attribute url from anchor

I can't extract the href attribute of anchors from the page. I tried using the re library:
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    links.append(link.get('href'))
But it doesn't work; I get this error:
table_rows = soup.find('table').find_all('tr')[1:]
AttributeError: 'NoneType' object has no attribute 'find_all'
Can you help me understand how exactly to extract them?
Thanks in advance.
Edit:
Full code:
import requests
from bs4 import BeautifulSoup
import re

DOMAIN_NAME = "https://www.dllr.state.md.us/employment"
BASE_URL = DOMAIN_NAME + '/warn2010.shtml'

def parse_url(url):
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    data = []
    table_rows = soup.find('table').find_all('tr')[1:]
    for table_row in table_rows:
        table_data = table_row.find_all('td')
        data.append({
            'notice_date': table_data[0].text,
            'naics_code': table_data[1].text,
            'company': table_data[2].text,
            'location': table_data[3].text,
            'wia_code': table_data[4].text,
            'total_employees': table_data[5].text,
            'effective_date': table_data[6].text,
            'type_code': table_data[7].text
        })
    return data

def run_ingestion():
    html_source = requests.get(BASE_URL, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        print(link.get('href'))
        url = DOMAIN_NAME + '/' + link.get('href')
        data = parse_url(url)
        for row in data:
            print(row)

if __name__ == '__main__':
    run_ingestion()
Following your code, you should try this:
soup = BeautifulSoup(html_source, 'html.parser')
tag = soup.findAll('a', attrs={'href': re.compile("^http://")})
links = [i["href"] for i in tag]
As the Beautiful Soup documentation says:
If find() can't find anything, it returns None.
That means your soup.find('table') found no <table> element on that page.
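A defensive sketch of parse_url (assuming the same page structure as in the question) that skips pages without a table instead of raising:

def parse_url(url):
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    table = soup.find('table')
    if table is None:  # the page has no <table>, so there is nothing to parse
        return []
    data = []
    for table_row in table.find_all('tr')[1:]:
        table_data = table_row.find_all('td')
        data.append({'notice_date': table_data[0].text,
                     'company': table_data[2].text})  # extend with the other columns as needed
    return data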
I would go with a more succinct list comprehension using an attribute = value CSS selector with the starts-with (^) operator:
links = [link['href'] for link in soup.select("a[href^='http:']")]

python beautiful soup output into excel

I am trying to get the output from the Python script into Excel. The script works fine in Python, but when I add the csv import and the writerow call it doesn't work: it says price is not defined in writerow. Also, how would I write multiple items? Any help would be appreciated.
import csv
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv', 'w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        print(item_name.get_text(strip=True))
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        print(item_fame.get_text(strip=True))

writer.writerow(price)
trade_spider(1)
The object price is not defined anywhere in your script outside of the function get_single_item_data. Outside of that function your code cannot recognize any object with that name. Also, get_single_item_data does not return anything from the BeautifulSoup object. It only prints it. You should rewrite your function to be something like this:
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # create list to contain addresses
    addresses = []
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        # add each address to the list
        addresses.append(address)
        print(item_name.get_text(strip=True))
    # create list for prices
    prices = []
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        # add prices to list
        prices.append(price)
        print(item_fame.get_text(strip=True))
    # alter the code to return the data structure you prefer
    return [addresses, prices]
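From there, the caller (trade_spider in the question) can unpack the returned lists and write one CSV row per listing. A sketch, assuming addresses and prices line up one-to-one and writer is the module-level csv.writer from the question:

addresses, prices = get_single_item_data(href)
for address, price in zip(addresses, prices):
    writer.writerow([address, price])  # writerow expects a sequence of fields, not a bare string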

Beautifulsoup append html data

Could someone tell me how to append all the data to my variable?
name_company = soup.find_all("h1")
name_data = []
for item in name_company:
name_data.append(item.string)
Why, when I print(name_data), is only the last h1 scraped into my list?
Thanks!
EDIT :
Here's my simplified code :
def robot_crawl(max_pages):
    page = 1
    while page < max_pages:
        url = "http://tel.local.ch/en/q/Vaud%20(Canton)/imprimerie.html?page=" + str(page)
        get_url = requests.get(url)
        get_text = get_url.text
        # take the text of the request
        soup = BeautifulSoup(get_text, "html.parser")
        for link in soup.find_all('a', {'class': "details-entry-title-link"}):
            href = link.get('href')
            bot_get_data(href)
        page += 1

def bot_get_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name_company = soup.find_all("h1")
    name_data = []
    for item in name_company:
        name_data.append(item.string)
        print(item.string)  # text or string? don't know the diff
    excel_data_transfer(name_data)

def excel_data_transfer(dataname):
    workbook = xlsxwriter.Workbook('datasccraping3.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', dataname)
    workbook.close()
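No answer is recorded here, but one likely cause (an assumption, not confirmed in the thread) is that bot_get_data builds a fresh name_data for every URL and excel_data_transfer recreates the workbook on every call, so only the last company survives. A sketch that accumulates names across all pages and writes the workbook once, reusing the URLs and class names from the question:

import requests
import xlsxwriter
from bs4 import BeautifulSoup

all_names = []  # collected across every detail page

def bot_get_data(item_url):
    soup = BeautifulSoup(requests.get(item_url).text, "html.parser")
    for item in soup.find_all("h1"):
        all_names.append(item.string)

def excel_data_transfer(dataname):
    workbook = xlsxwriter.Workbook('datasccraping3.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', dataname)  # one call, one file, all rows
    workbook.close()

# after robot_crawl(max_pages) has filled all_names:
# excel_data_transfer(all_names)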

Python web scraping using BeautifulSoup, Loop and skip certain URL value

So I am using the following code to scrape statutes from a site.
from bs4 import BeautifulSoup
import requests

f = open('C:\Python27\projects\FL_final.doc', 'w')
base_url = "http://www.leg.state.fl.us/statutes/index.cfm?App_mode=Display_Statute&URL=0000-0099/00{chapter:02d}/00{chapter:02d}.html"

for chapter in range(1, 9):
    url = base_url.format(chapter=chapter)
    r = requests.get(url)
    soup = BeautifulSoup((r.content), "html.parser")
    tableContents = soup.find('div', {'class': 'Chapters'})
    for title in tableContents.find_all('div', {'class': 'Title'}):
        f.write(title.text)
    for data in tableContents.find_all('div', {'class': 'Section'}):
        data = data.text.encode("utf-8", "ignore")
        data = "\n\n" + str(data) + "\n"
        f.write(data)

f.close()
The problem is that certain chapters are missing. For example, there are pages for chapters 1 and 2, but the pages for chapters 3, 4 and 5 don't exist. So when I use range(1, 9) I get errors, since it can't pick up the chapter 3, 4, 5 contents: their URLs (0003/0003, 0004/0004, 0005/0005) don't exist.
How can I skip missing URLs in my loop and let the program find the next available URL within the range?
Here is chapter 1's URL: http://www.leg.state.fl.us/statutes/index.cfm?App_mode=Display_Statute&URL=0000-0099/0001/0001.html
You can add a try around the URL request and check that tableContents is not None before applying find_all:
import sys
import requests
from bs4 import BeautifulSoup

f = open('C:\Python27\projects\FL_final.doc', 'w')
base_url = "http://www.leg.state.fl.us/statutes/index.cfm?App_mode=Display_Statute&URL=0000-0099/00{chapter:02d}/00{chapter:02d}.html"

for chapter in range(1, 9):
    url = base_url.format(chapter=chapter)
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:  # this is the correct syntax
        print "missing url"
        print e
        sys.exit(1)
    soup = BeautifulSoup((r.content), "html.parser")
    tableContents = soup.find('div', {'class': 'Chapters'})
    if tableContents is not None:
        for title in tableContents.find_all('div', {'class': 'Title'}):
            f.write(title.text)
        for data in tableContents.find_all('div', {'class': 'Section'}):
            data = data.text.encode("utf-8", "ignore")
            data = "\n\n" + str(data) + "\n"
            print data
            f.write(data)
you can check if tableContents is found, e.g.:
tableContents = soup.find('div', {'class': 'Chapters'})
if tableContents:
    for title in tableContents.find_all('div', {'class': 'Title'}):
        f.write(title.text)
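If the goal is to keep going rather than exit, a sketch of the loop (reusing f and base_url from the question, and assuming a missing chapter either fails the request or renders a page without the Chapters div) that simply skips to the next chapter:

for chapter in range(1, 9):
    url = base_url.format(chapter=chapter)
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        continue  # request failed: skip this chapter and try the next one
    soup = BeautifulSoup(r.content, "html.parser")
    tableContents = soup.find('div', {'class': 'Chapters'})
    if tableContents is None:
        continue  # page exists but has no chapter contents: skip it
    for title in tableContents.find_all('div', {'class': 'Title'}):
        f.write(title.text)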

Categories

Resources