python webscraping and write data into csv - python

I'm trying to save all the data (i.e. all pages) in a single CSV file, but this code only saves the final page's data. E.g. here url[] contains 2 URLs, and the final CSV only contains the data from the 2nd URL.
I'm clearly doing something wrong in the loop, but I don't know what.
Also, this page contains 100 data points, but this code only writes the first 44 rows.
Please help with this issue.
from bs4 import BeautifulSoup
import requests
import csv
# Scrape the listing titles and links from two Craigslist result pages and
# write them to filename2.csv.
# NOTE(review): indentation was lost in this paste; the nesting below is
# reconstructed from the behaviour described in the question - confirm.
url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content)
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    # Re-created on every pass of the URL loop, so rows collected from the
    # previous page are discarded before the file is written.
    gen_list=[]
    for row in g_data:
        # Each field falls back to '' when reading it raises.
        try:
            name = row.text
        except:
            name=''
        try:
            link = "http://sfbay.craigslist.org"+row.get("href")
        except:
            link=''
        gen=[name,link]
        gen_list.append(gen)
# NOTE(review): binary mode 'wb' for csv.writer suggests Python 2 - confirm.
with open ('filename2.csv','wb') as file:
    writer=csv.writer(file)
    for row in gen_list:
        writer.writerow(row)

The gen_list is being re-initialized inside the loop that runs over the URLs:
gen_list=[]
Move this line outside the for loop.
...
url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
gen_list=[]
for ur in url:
...

I found your post later; you may want to try this method:
import requests
from bs4 import BeautifulSoup
import csv
# Collect the listing links from one Craigslist search page and save them,
# one link per row, to sfbay.csv.
final_data = []
url = "https://sfbay.craigslist.org/search/sss"
r = requests.get(url)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all(class_="result-row")
for details in get_details:
    for link in details.find_all(class_="hdrlnk"):
        # Each CSV row is a one-element list holding the link target.
        final_data.append([link.get("href")])
print(final_data)

filename = "sfbay.csv"
# newline="" is what the csv module expects (avoids blank lines between rows
# on Windows); the writer gets its own name so the file handle is not shadowed.
with open("./" + filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    # The stray blank first row of the earlier version is intentionally dropped.
    writer.writerows(final_data)

Related

After scraping I can not write the text to a text file

I am trying to scrape the prices from a website and it's working, but I can't write the result to a text file.
This is my Python code:
import requests
from bs4 import BeautifulSoup as bs
# Fetch the page and append the text of one container <div> to example.txt.
url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")
price = soup.find("div", {"class":"d-flex row col-md-9 px-0"})
name =("example")
# No encoding is passed to open(), so the platform default is used when writing.
# NOTE(review): the file object is never closed explicitly in this snippet.
f =open(name + '.txt', "a")
f.write(price.text)
This is not working, but if I print it instead of trying to write it to a text file, it works. I have searched for a long time but don't understand it. I think it must be a string to be written to a text file, but I don't know how to change the output to a string.
You're getting an error due to a Unicode character.
Try adding the encoding='utf-8' argument when opening the file.
Also, your code gives somewhat messy output. Try this instead:
import requests
from bs4 import BeautifulSoup as bs
# Scrape player names and prices from the page and write them to example.txt
# as one "name:price" line per player.
url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
r.raise_for_status()
soup = bs(r.content, "html.parser")
rows = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
if rows is None:
    # Container missing (layout change or blocked request): stop with a clear
    # message instead of an AttributeError on None further down.
    raise SystemExit("price container not found on " + url)
prices = rows.find_all("span", {"class": "price-holder-row"})
names = rows.find_all("div", {"class": "name-holder"})
price_list = [price.text.strip("\n ") for price in prices]
# Keep the first whitespace-separated token of each name; an empty holder
# yields "" so names and prices stay aligned for zip() below.
name_list = [(tag.text.split() or [""])[0] for tag in names]
name = "example"
with open(f"{name}.txt", mode="w", encoding="utf-8") as f:
    # Distinct loop names so the output file's base name is not overwritten.
    for player, price in zip(name_list, price_list):
        f.write(f"{player}:{price}\n")

Need help in scraping information from multiple webpages and import to csv file in tabular form - Python

I have been working on web scraping the infobox information on Wikipedia. This is the code that I have been using:
import requests
import csv
from bs4 import BeautifulSoup
# Scrape the infobox table of each Wikipedia article and append what was found
# to credit_unions.csv.
# NOTE(review): indentation was lost in this paste; the nesting below is
# reconstructed - confirm against the original post.
URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union','https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union','https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help','https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
'https://en.wikipedia.org/wiki/State_Employees_Credit_Union','https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']
for url in URL:
    headers=[]
    rows=[]
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'html.parser')
    table = soup.find('table',class_ ='infobox')
    credit_union_name= soup.find('h1', id = "firstHeading")
    # Stripped text of every <th> in the infobox.
    header_tags = table.find_all('th')
    headers = [header.text.strip() for header in header_tags]
    data_rows = table.find_all('tr')
    for row in data_rows:
        value = row.find_all('td')
        beautified_value = [dp.text.strip() for dp in value]
        # Table rows without any <td> cell are skipped.
        if len(beautified_value) == 0:
            continue
        # Each entry appended to rows is itself a list (one per table row).
        rows.append(beautified_value)
    rows.append("")
    rows.append([credit_union_name.text.strip()])
    rows.append([url])
    with open(r'credit_unions.csv','a+',newline="") as output:
        writer=csv.writer(output)
        writer.writerow(headers)
        # rows is a list of lists, written here as the cells of ONE csv row.
        writer.writerow(rows)
However, I checked the CSV file and the information is not presented in tabular form. The scraped elements are stored in nested lists instead of a single list. I need the scraped information for each URL to be stored in a single list and written to the CSV file in tabular form with the headings. I need help with this.
The infoboxes have different structures and labels. So I think the best way to solve this is to use dicts and a DictWriter.
import requests
import csv
from bs4 import BeautifulSoup
# Scrape the infobox of each Wikipedia article into one dict per page and write
# all pages to credit_unions.csv as a table whose columns are the union of the
# labels seen across every infobox.
URL = [
    'https://en.wikipedia.org/wiki/Workers_Credit_Union',
    'https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
    'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union',
    'https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
    'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help',
    'https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
    'https://en.wikipedia.org/wiki/State_Employees_Credit_Union',
    'https://en.wikipedia.org/wiki/United_Heritage_Credit_Union',
]
csv_headers = set()
csv_rows = []
for url in URL:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    credit_union_name = soup.find('h1', id="firstHeading")
    table = soup.find('table', class_='infobox')
    if credit_union_name is None or table is None:
        # Missing page or a page without an infobox: skip it rather than
        # crash with an AttributeError on None.
        print("skipping {}: no heading or infobox found".format(url))
        continue
    csv_row = {}
    for data_row in table.find_all('tr'):
        label = data_row.find('th')
        value = data_row.find('td')
        # Only label/value pairs are kept; heading-only or image rows are skipped.
        if label is None or value is None:
            continue
        beautified_label = label.text.strip()
        csv_row[beautified_label] = value.text.strip()
        csv_headers.add(beautified_label)
    csv_row["name"] = credit_union_name.text.strip()
    csv_row["url"] = url
    csv_rows.append(csv_row)

headers = ["name", "url"]
# Exclude the two fixed columns so an infobox label called "name" or "url"
# cannot produce a duplicate fieldname.
headers += sorted(csv_headers - set(headers))
# 'w' rather than 'a+': appending would repeat the header row and all data on
# every run. utf-8 because infobox text regularly contains non-ASCII characters.
with open(r'credit_unions.csv', 'w', newline="", encoding="utf-8") as output:
    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()
    # Labels a page does not have are written as empty cells (DictWriter restval).
    writer.writerows(csv_rows)

Web scraping soup.findAll always return empty list

I am trying to scrape a web page using Python and BeautifulSoup. When I write:
table = soup.find('table')
it returns None,
and when I try to get the row content, it always returns an empty list.
I also tried Selenium and got the same result: an empty list.
import requests
from bs4 import BeautifulSoup
import csv
# Request the page and write the cells of every matching table row to a CSV file.
# NOTE(review): indentation was lost in this paste; the try/for nesting below is
# reconstructed - confirm against the original post.
# The text after ">>>" on some lines is the asker's record of the observed
# output, not executable code.
url = "https://www.iea.org/data-and-statistics/data-tables?country=CANADA&energy=Balances&year=2010"
response = requests.get(url)
print(response.status_code) >>> print 200
soup = BeautifulSoup(response.text,"html.parser")
# Note the trailing space inside the class value being matched.
tr = soup.findAll('tr', attrs={'class': 'm-data-table__row '})
print(tr) >>> print []
print(len(tr)) >>> print 0
csvFile = open("C:/Users/User/Desktop/test27.csv",'wt',newline='', encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for cell in tr:
        td = cell.find_all('td')
        # One CSV row per <tr>, with newlines removed from each cell's text.
        row = [i.text.replace('\n','') for i in td]
        writer.writerow(row)
finally:
    # The file is closed whether or not writing succeeded.
    csvFile.close()
Any help?
When you analyse the website, you can see that the data is loaded via an AJAX call. The following script makes that call and saves the returned JSON to a file:
import requests, json
from bs4 import BeautifulSoup
# Call the JSON endpoint the page loads its table from and store the decoded
# payload in data.json.
res = requests.get("https://api.iea.org/stats/?year=2010&countries=CANADA&series=BALANCES")
# Stop on an HTTP error so a failure is reported as such, not as a JSON
# decoding problem on an error page.
res.raise_for_status()
data = res.json()
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f)

Writing scraped links to a CSV file using Python 3

I have scraped a website for HTML links and have a result of about 500 links. When I try to write them to a CSV file, I do not get the list, only the base page.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
# Fetch the page, print every link target, then write a CSV with a header row
# and one data row.
# NOTE(review): indentation was lost in this paste; the nesting below is
# reconstructed from the described result (header plus a single link) - confirm.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
# Created here but never added to in this snippet.
link_set = set()
for link in soup.find_all('a'):
    # web_links is overwritten on every iteration; after the loop it holds
    # only the href of the last <a> element.
    web_links = link.get("href")
    print(web_links)
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
# Writes a single data row: the value left in web_links by the loop above.
writer.writerow([web_links])
csvfile.close()
I only get two lines in my CSV file: the header 'Links' and www.census.gov. I have tried changing it by adding another for loop in the CSV writer area, but I get similar results.
# Second attempt from the question: write each previously unseen link as it is found.
# NOTE(review): indentation was lost in this paste; nesting reconstructed - confirm.
# NOTE(review): `join` is not defined or imported in any snippet shown here.
for link in soup.find_all('a'):
    web_links = link.get('href')
    abs_url = join(page, web_links)
    print(abs_url)
    # Skip empty values and links already recorded in link_set.
    if abs_url and abs_url not in link_set:
        # NOTE(review): the other snippets call writer.writerow; `write` is
        # used here - verify it exists on this writer object.
        writer.write(str(abs_url) + "\n")
        link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the csv file, but no dice. Where am I making my mistake?
In your code, you are writing only two rows to the CSV, i.e.:
writer.writerow(['Links'])
writer.writerow([web_links])
Here web_links is the last href value retrieved by the loop.
I don't see the need for the set. You can print and write to the CSV without a set in the following way:
# Fetch the page and write every non-empty link target to code_python.csv,
# one per row, under a "Links" header.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
# The with-block closes the file even if writing fails; plain 'w' is enough
# because nothing is read back from the file.
with open('code_python.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    for link in soup.find_all('a'):
        web_links = link.get("href")
        # <a> elements without an href yield None and are skipped.
        if web_links:
            print(web_links)
            writer.writerow([web_links])
You never added the scraped links to your set():
import requests
from bs4 import BeautifulSoup
import csv
# Fetch the page, collect the distinct link targets in a set, and write them to
# code_python.csv, one per row, under a "Links" header.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    if not web_links:
        # <a> without an href: adding None would end up as an empty CSV row.
        continue
    print(web_links)
    link_set.add(web_links)
# The with-block closes the file even if writing fails.
with open('code_python.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    # Sets are unordered; sorting makes the file identical from run to run.
    writer.writerows([link] for link in sorted(link_set))

writing only one row after website scraping

I am trying to extract a list of all the golf courses in the USA through this link. I need to extract the name of the golf course, the address, and the phone number. My script is supposed to extract all the data from the website, but it looks like it only writes one row to my CSV file. I noticed that when I print the "name" field it only prints once, despite the find_all function. I need all of the data, not just one field from multiple links on the website.
How do I go about fixing my script so that it writes all the needed data into a CSV file?
Here is my script:
import csv
import requests
from bs4 import BeautifulSoup
# Collect name, phone and address for each listing on one results page and
# append them to PGN_Final.csv.
# This is Python 2 code (it uses the `print name` statement).
# NOTE(review): indentation was lost in this paste; the nesting below is
# reconstructed - confirm against the original post.
courses_list = []
for i in range(1):
    url="http://www.thegolfcourses.net/page/1?ls&location=California&orderby=title&radius=6750#038;location=California&orderby=title&radius=6750" #.format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    g_data2=soup.find_all("div",{"class":"list"})
    for item in g_data2:
        # Each field falls back to '' when its lookup raises.
        try:
            name= item.contents[7].find_all("a",{"class":"entry-title"})[0].text
            print name
        except:
            name=''
        try:
            phone= item.contents[7].find_all("p",{"class":"listing-phone"})[0].text
        except:
            phone=''
        try:
            address= item.contents[7].find_all("p",{"class":"listing-address"})[0].text
        except:
            address=''
        # One [name, phone, address] record per matched <div class="list">.
        course=[name,phone,address]
        courses_list.append(course)
with open ('PGN_Final.csv','a') as file:
    writer=csv.writer(file)
    for row in courses_list:
        # Every field is encoded to UTF-8 before being written.
        writer.writerow([s.encode("utf-8") for s in row])
Here is a neater implementation of your code. You can use the urllib2 library instead of requests; bs4 works the same way, though.
import csv
import urllib2
from BeautifulSoup import *
# Python 2 script: download one results page and write course name, phone
# number and address to PGN_Final.csv under a header row.
url = "http://www.thegolfcourses.net/page/1?ls&location=California&orderby=title&radius=6750#038;location=California&orderby=title&radius=6750"
r = urllib2.urlopen(url).read()
soup = BeautifulSoup(r)
courses_list = [("Course name", "Phone Number", "Address")]
names = soup.findAll('h2', attrs={'class': 'entry-title'})
phones = soup.findAll('p', attrs={'class': 'listing-phone'})
address = soup.findAll('p', attrs={'class': 'listing-address'})
# zip() pairs the three lists by position and stops at the shortest one.
# NOTE(review): a listing that lacks a phone or address would shift the
# pairing - verify against the page markup.
for na, ph, add in zip(names, phones, address):
    courses_list.append((na.text, ph.text, add.text))
# 'wb' instead of 'a': appending would repeat the header row on every run, and
# the Python 2 csv module expects its file opened in binary mode.
with open('PGN_Final.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for row in courses_list:
        # Fields are encoded to UTF-8 byte strings before being written.
        writer.writerow([s.encode("utf-8") for s in row])

Categories

Resources