I have scraped a website for HTML links and have a result of about 500 links. When I try to write them to a CSV file, I do not get the full list — only the base page's URL.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

# Fetch the page; printing the status code makes a failed request visible.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')

# Intended to hold unique links — but note nothing is ever added to it.
link_set = set()

# web_links is rebound on every iteration, so after the loop it holds only
# the href of the LAST anchor on the page.
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)

# BUG (the asker's problem): the CSV writing happens AFTER the loop, so the
# file gets exactly two rows — the header and the final href.
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
writer.writerow([web_links])
csvfile.close()
I only get two lines in my CSV file: the header 'Links' and www.census.gov. I have tried varying it by adding another for loop in the CSV-writer area, but I get similar results.
# Second attempt — still broken, for two new reasons:
for link in soup.find_all('a'):
    web_links = link.get('href')
    # BUG: bare join() is not defined here; resolving a relative href against
    # the page needs urllib.parse.urljoin(base_url, web_links).
    abs_url = join(page, web_links)
    print(abs_url)
    if abs_url and abs_url not in link_set:
        # BUG: csv.writer has no .write() method — it should be
        # writer.writerow([abs_url]).
        writer.write(str(abs_url) + "\n")
        link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the csv file, but no dice. Where am I making my mistake?
In your code, you are writing only two rows to the CSV, i.e.:
writer.writerow(['Links'])
writer.writerow([web_links])
Here web_links holds only the last href value retrieved by the loop.
I also don't see the set instance being used. You can print and write to the CSV without the set in the following way:
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')

# Use a context manager so the file is closed even if scraping raises;
# newline='' is required by the csv module to avoid blank rows on Windows.
with open('code_python.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    # Write each href INSIDE the loop — this is the fix for only the last
    # link ending up in the file.
    for link in soup.find_all('a'):
        web_links = link.get("href")
        if web_links:  # skip anchors that have no href attribute
            print(web_links)
            writer.writerow([web_links])
You have never added the scraped links to your set():
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')

link_set = set()
# Add every href to the set as we go — the step missing from the original.
# Skip anchors without an href so None never enters the set.
for link in soup.find_all('a'):
    web_links = link.get("href")
    if web_links:
        print(web_links)
        link_set.add(web_links)

# Context manager guarantees the file is closed; newline='' prevents the
# csv module from emitting blank rows on Windows.  Note a set is unordered,
# so the rows come out in arbitrary order.
with open('code_python.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    for link in link_set:
        writer.writerow([link])
Related
I have a 45k+ rows CSV file, each one containing a different path of the same domain - which are structurally identical to each other - and every single one is clickable. I managed to use BeautifulSoup to scrape the title and content of each one and through the print function, I was able to validate the scraper. However, when I try to export the information gathered to a new CSV file, I only get the last URL's street name and description, and not all of them as I expected.
from bs4 import BeautifulSoup
import requests
import csv

with open('URLs.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    # StreetName/Description are overwritten on every iteration, so after
    # the loop they hold only the values scraped from the LAST url.
    for row in reader:
        site = requests.get(row['addresses']).text
        soup = BeautifulSoup(site, 'lxml')
        StreetName = soup.find('div', class_='hist-title').text
        Description = soup.find('div', class_='hist-content').text

# BUG (the asker's problem): the output file is opened and written AFTER the
# scraping loop finishes, so exactly one data row — the last URL's — ends up
# in OutputList.csv.  The writerow call must move inside the loop above.
with open('OutputList.csv','w', newline='') as output:
    Header = ['StreetName', 'Description']
    writer = csv.DictWriter(output, fieldnames=Header)
    writer.writeheader()
    writer.writerow({'StreetName' : StreetName, 'Description' : Description})
How can the output CSV have on each row the street name and description for the respective URL row in the input CSV file?
You need to open both files on the same level and then read and write on each iteration. Something like this:
from bs4 import BeautifulSoup
import requests
import csv

# Open input and output together so each scraped row is written immediately
# inside the loop.  newline='' is required by the csv module on Windows —
# without it every data row is followed by a blank line.
with open('URLs.csv') as a, open('OutputList.csv', 'w', newline='') as b:
    reader = csv.reader(a)
    writer = csv.writer(b, quoting=csv.QUOTE_ALL)
    writer.writerow(['StreetName', 'Description'])
    # Assuming url is the first field in the CSV
    for url, *_ in reader:
        r = requests.get(url)
        if r.ok:  # skip URLs that didn't return a 2xx/3xx response
            soup = BeautifulSoup(r.text, 'lxml')
            street_name = soup.find('div', class_='hist-title').text.strip()
            description = soup.find('div', class_='hist-content').text.strip()
            writer.writerow([street_name, description])
I hope it helps.
I am trying to webscrape soccer team-names and odds from a webpage to a csv file. My problem is that it's only writing the first two strings into a csv-file. Can anyone see what I'm doing wrong?
When I print it, it works perfectly (as seen in the picture), but when I extract it to a CSV file, the file is nearly empty.
from selenium import webdriver
from bs4 import BeautifulSoup
import csv

url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)

soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})

# teams/odds are rebound on every iteration, so after the loop they hold
# only the data from the LAST container.
for container in containers:
    teams = [x.get_text() for x in container.findAll(
        "span", {"class": "c-events__team"}
    )]
    odds = [x.attrs.get('data-coef') for x in container.findAll(
        "a", {"class": "c-bets__bet"}
    )]
    #print(teams)
    #print(odds)
    #print()

# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"

# BUG (the asker's problem): writing happens AFTER the loop, so only the
# final teams/odds pair reaches the file — two rows at most.
# writing to csv file
with open(filename, 'w') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(teams)
    csvwriter.writerow(odds)
It seems that you want something like this:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv

url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)

soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})

# Accumulate every container's team list and odds list so nothing is lost
# when the loop finishes (the original kept only the last pair).
teams_odds = []
for container in containers:
    teams_odds.append([x.get_text() for x in container.findAll(
        "span", {"class": "c-events__team"}
    )])
    teams_odds.append([x.attrs.get('data-coef') for x in container.findAll(
        "a", {"class": "c-bets__bet"}
    )])

# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"

# writing to csv file; newline='' stops the csv module from inserting a
# blank line between rows on Windows.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for toe in teams_odds:
        csvwriter.writerow(toe)
I am very new to python and BeautifulSoup and I am trying to use it to scrape multiple urls at the same time using a loop. The loop will consist of locating the banner slide on the home page of each website and get the len of how many banners that website has and place them into an excel file next to the corresponding url. I have a list of urls saved in a csv file and basically what I want to do is take each of those urls and run the loop, pulling the number of banners, and put that number next to the url into a separate column in excel.
This is the code I have so far and all it does for me is write the urls back into a csv file and gives me the number of banners for only the last url.
from bs4 import BeautifulSoup
import requests

# Manual file handling instead of the csv module.
with open("urls.csv", "r") as f:
    csv_raw_cont=f.read()

split_csv=csv_raw_cont.split('\n')
split_csv.remove('')

separator=';'

# Output file is opened without a context manager; if any request below
# raises, f is never closed.
filename = "DDC_number_of_banners.csv"
f = open(filename, "w")
headers = "url, Number_of_Banners\n"
f.write(headers)

for each in split_csv:
    url_row_index=0
    url = each.split(separator)[url_row_index]
    html = requests.get(url).content
    soup= BeautifulSoup(html, "html.parser")
    banner_info = soup.findAll('div',{'class':['slide', 'slide has-link',
        'html-slide slide has-link']})
    Number_of_banners = len(banner_info)
    # BUG (the asker's problem): csv_raw_cont is the ENTIRE input file, so
    # every iteration re-writes all the urls; this line should write `url`
    # instead of csv_raw_cont.
    f.write(csv_raw_cont + "," + str(Number_of_banners) + "," + "\n")

f.close()
Making use of Python's CSV library would make this a bit simpler:
from bs4 import BeautifulSoup
import requests
import csv

# Read one url per iteration and write its result immediately, so every url
# (not just the last) produces an output row.  newline="" keeps the csv
# module from adding blank rows on Windows.
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['url', 'Number_of_banners'])

    for url in f_urls:
        url = url.strip()
        if not url:  # skip blank lines (e.g. a trailing newline) that would crash requests.get
            continue
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        banner_info = soup.findAll('div', {'class': ['slide', 'slide has-link', 'html-slide slide has-link']})
        csv_output.writerow([url, len(banner_info)])
To include information such as each banner's data-label:
from bs4 import BeautifulSoup
import requests
import csv

# Same structure as above, but each banner's data-label is appended as an
# extra column after the count.
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['url', 'Number_of_banners', 'data_labels'])

    for url in f_urls:
        url = url.strip()
        if not url:  # skip blank lines that would crash requests.get
            continue
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        banner_info = soup.findAll('div', {'class': ['slide', 'slide has-link', 'html-slide slide has-link']})
        # One label per banner; get() returns None for banners without the attribute.
        data_labels = [banner.get('data-label') for banner in banner_info]
        csv_output.writerow([url, len(banner_info)] + data_labels)
So i have a working code that pulls data from 30 websites on a domain.
# NOTE(review): backslashes in these Windows paths should use raw strings
# (r"c:\source\list.csv") — "\s", "\l" and "\o" only survive because they are
# not recognized escape sequences.
with open("c:\source\list.csv") as f:
    for row in csv.reader(f):
        for url in row:
            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'lxml')
            tables = soup.find('table', attrs={"class": "hpui-standardHrGrid-table"})
            for rows in tables.find_all('tr', {'releasetype': 'Current_Releases'})[0::1]:
                item = []
                for val in rows.find_all('td'):
                    item.append(val.text.strip())
                # Reopening with 'w' truncates the file each time, so only
                # the most recently written content survives.
                with open('c:\source\output_file.csv', 'w', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow({url})
                    # BUG (the asker's problem): writerows() treats `item` as a
                    # list of ROWS, so each string is iterated character by
                    # character — one character per cell.  A single row needs
                    # writer.writerow(item).
                    writer.writerows(item)
When I open the CSV file, I see each character taken from the 'Item' variable is stored in its own cell. I can't seem to find out what the heck is doing this and how to fix it.
Any thoughts?
I fixed this by changing
writer.writerows(item)
to
writer.writerow(item)
I'm trying to save all the data (i.e. all pages) in a single CSV file, but this code only saves the final page's data. E.g. here url[] contains 2 URLs, and the final CSV only contains the 2nd URL's data.
I'm clearly doing something wrong in the loop, but I don't know what.
Also, this page contains 100 data points, but this code only writes the first 44 rows.
Please help with this issue.
from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content)
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    # BUG (the asker's problem): gen_list is re-created on every pass of the
    # url loop, discarding the rows collected from the previous page.
    gen_list=[]
    for row in g_data:
        # Bare except clauses: any failure silently yields an empty string.
        try:
            name = row.text
        except:
            name=''
        try:
            link = "http://sfbay.craigslist.org"+row.get("href")
        except:
            link=''
        gen=[name,link]
        gen_list.append(gen)

# NOTE(review): 'wb' is the Python 2 idiom for csv files; on Python 3 this
# should be open('filename2.csv', 'w', newline='').
with open ('filename2.csv','wb') as file:
    writer=csv.writer(file)
    for row in gen_list:
        writer.writerow(row)
the gen_list is being initialized again inside your loop that runs over the urls.
gen_list=[]
Move this line outside the for loop.
...
url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
gen_list=[]
for ur in url:
...
I found your post later — you could try this method:
import requests
from bs4 import BeautifulSoup
import csv

final_data = []
url = "https://sfbay.craigslist.org/search/sss"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")

get_details = soup.find_all(class_="result-row")
for details in get_details:
    getclass = details.find_all(class_="hdrlnk")
    for link in getclass:
        # One single-column row per link.
        final_data.append([link.get("href")])
print(final_data)

filename = "sfbay.csv"
# newline='' keeps the csv module from inserting blank lines on Windows.
with open("./" + filename, "w", newline="") as csvfile:
    # Don't shadow the file handle with the writer object.
    writer = csv.writer(csvfile, delimiter=",")
    # A real header instead of the original's meaningless empty first row.
    writer.writerow(["link"])
    # Iterate the rows directly instead of range(len(final_data)).
    for row in final_data:
        writer.writerow(row)