I am a newbie at coding and I started with Python. I use it for scraping data from websites, online shops to be specific. I want to scrape every page of the paginated result list and put the product URLs in one CSV file.
This is what I've been trying:
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

# opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for number2 in range(10):
        filename = "tokopedia" + str(number2) + ".csv"
        f = open(filename, "w")
        headers = "Link" + "\n"
        f.write(headers)
        for product in product_links:
            barang = product.a["ng-href"]
            print(barang + "\n")
            f.write(barang + "\n")
        f.close()

driver.close()
The result that I got in the CSV is only for one page. Can you help me?
The file is being recreated on every pass, so each write wipes out the one before and you end up with only one page's links. Open the file once before the loop, write the header once, and close everything at the end:

from selenium import webdriver
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

filename = "tokopedia.csv"
f = open(filename, "w")
f.write("Link" + "\n")  # header row, written once

# opening each result page
for number in range(10):  # pages 0-9; use range(1, 11) if the site counts from 1
    driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for product in product_links:
        barang = product.a["ng-href"]
        print(barang)
        f.write(barang + "\n")

f.close()
driver.close()
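A slightly sturdier sketch of the same loop using the csv module and a with block, so the file is closed even if the scrape dies halfway; the product-summary class and ng-href attribute are taken from the question, not verified against the live site:

import csv
from selenium import webdriver
from bs4 import BeautifulSoup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
driver = webdriver.Chrome('/home/yoga/Downloads/chromedriver')

with open('tokopedia.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Link'])  # header row
    for number in range(10):
        driver.get(myurl + str(number))
        page = BeautifulSoup(driver.page_source, 'html.parser')
        for product in page.find_all('div', {'class': 'product-summary'}):
            link = product.a.get('ng-href')
            if link:  # skip cards whose anchor lacks the attribute
                writer.writerow([link])

driver.close()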
I am trying to download all netcdf (.nc) files here:
https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/
import urllib3
from bs4 import BeautifulSoup
site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('.nc')
However, list_urls is empty after running this code. How can I fix it?
Here is what I did: soup.find_all(text=lambda t: ".nc" in t). (find_all('.nc') looks for a tag named .nc, which doesn't exist; matching the link text is what does the job.) Working fine, with a progress bar as well :)
import sys
import requests
import urllib3
import humanize
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")

link_urls = soup.find_all(text=lambda t: ".nc" in t)
for link in link_urls:
    download_link = "{}2000/{}".format(base_url, link)
    r = requests.get(download_link, stream=True)
    total_length = r.headers.get('content-length')
    print("\nDownloading: {}\nTotalSize: {}".format(
        download_link, humanize.naturalsize(total_length or 0)))
    with open(link, "wb") as f:
        print("Downloading %s" % link)
        if total_length is None:  # no content-length header
            f.write(r.content)
        else:
            dl = 0
            total_length = int(total_length)
            for data in r.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.flush()
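A sketch of a sturdier way to collect the same links, selecting anchors whose href ends in .nc instead of matching visible text (this assumes the directory index is rendered as plain <a href> entries, which is how these listings normally look):

import urllib3
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/'
html = site.request('GET', base_url + '2000/')
soup = BeautifulSoup(html.data, "lxml")

# CSS attribute selector: every <a> whose href ends with ".nc"
nc_links = [a['href'] for a in soup.select('a[href$=".nc"]')]
print("found {} files".format(len(nc_links)))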
I am trying to scrape a website and output the info to a CSV file. The data I am trying to extract prints fine to the terminal, but I need it in the CSV file.
I have tried several different methods but cannot find a solution.
The CSV file is created but it's just empty. There is probably something really simple.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import time
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'

options = Options()
options.page_load_strategy = 'normal'

# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
options.add_argument("--window-size=1920x1080")

results = driver.find_element_by_class_name('program1_content_container')
soup = BeautifulSoup(results.text, 'html.parser')

# results = driver.find_element_by_class_name('program1_content_container')
p_data1 = soup.find_all("div", {"class_name": "program1_content_container"})
p_data2 = soup.find_all("div", {"class_name": "program_time"})
p_data3 = soup.find_all("div", {"class_name": "sport"})
p_data4 = soup.find_all("div", {"class": "program_text"})

print("Here is your data, I am off ot sleep now see ya ")
print(results.text)

# Create csv
programme_list = []

# Programme List
for item in p_data1:
    try:
        name = item.contents[1].find_all(
            "div", {"class": "program1_content_container"})[0].text
    except:
        name = ''
    p_data1 = [time]
    programme_list.append(p_data1)

# Programme Time
for item in p_data2:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "program_time"})[0].text
    except:
        time = ''
    p_data2 = [time]
    programme_list.append(p_data2)

# Which sport
for item in p_data3:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "sport"})[0].text
    except:
        time = ''
    p_data3 = [time]
    programme_list.append(p_data3)

with open('sport.csv', 'w') as file:
    writer = csv.writer(file)
    for row in programme_list:
        writer.writerow(row)
I have just tried adding an object called data_output, then printing it:
data_output = [p_data1, p_data2, p_data3, p_data4]
...
print(data_output)
The output in the terminal is: (screenshot omitted)
Load the data into a pandas DataFrame and export it to CSV:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")

# wait until the programme container is visible before parsing
results = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, ".program1_content_container")))
soup = BeautifulSoup(results.get_attribute("outerHTML"), 'html.parser')

program_time = []
sport = []
program_text = []
program_info = []

for item in soup.select(".program_details"):
    if item.find_next(class_='program_time'):
        program_time.append(item.find_next(class_='program_time').text.strip())
    else:
        program_time.append("Nan")
    if item.find_next(class_='sport'):
        sport.append(item.find_next(class_='sport').text.strip())
    else:
        sport.append("Nan")
    if item.find_next(class_='program_text'):
        program_text.append(item.find_next(class_='program_text').text.strip())
    else:
        program_text.append("Nan")
    if item.find_next(class_='program_info'):
        program_info.append(item.find_next(class_='program_info').text.strip())
    else:
        program_info.append("Nan")

df = pd.DataFrame({"program_time": program_time, "sport": sport,
                   "program_text": program_text, "program_info": program_info})
print(df)
df.to_csv("sport.csv")
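If you don't want the pandas row index written as the first column of the file, pass index=False:

df.to_csv("sport.csv", index=False)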
If you don't have pandas, you need to install it:
pip install pandas
As Blue Fishy said, you can try changing to w mode only, but you may run into an encoding error.
Solution that works on your data:
import csv

programme_list = ['19:55', 'MOTORU SPORTS', 'Motoru sporta "5 minūte"',
                  'Iknedēļas Alda Putniņa veidots apskats par motoru sportu',
                  '20:00', 'BASKETBOLS', '...']

with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', lineterminator='\n')
    for row in programme_list:
        print(row)
        writer.writerow([row])
Output
19:55
MOTORU SPORTS
"Motoru sporta ""5 minūte"""
Iknedēļas Alda Putniņa veidots apskats par motoru sportu
20:00
BASKETBOLS
...
Instead of writing binary, can you try changing wb to w?
Change
with open('sport.csv', 'wb') as file:
to
with open('sport.csv', 'w') as file:
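On Python 3 it also helps to pass newline='' when opening a file for csv.writer; per the csv module docs this prevents the extra blank line between rows that otherwise shows up on Windows:

with open('sport.csv', 'w', newline='', encoding='utf-8') as file: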
EDITED:
Sorry for being a bit late. Here is the code modified based on your original code, FYI:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'

options = Options()
options.page_load_strategy = 'normal'
options.add_argument("--window-size=1920x1080")  # must be set before the driver is created

# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")

page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')

# the fields are <p> tags, and the attribute filter key is "class", not "class_name"
p_data1 = soup.find_all("p", {"class": "program_info"})
p_data2 = soup.find_all("p", {"class": "program_time"})
p_data3 = soup.find_all("p", {"class": "sport"})
p_data4 = soup.find_all("p", {"class": "program_text"})

# Create csv
programme_list = []
for i in range(len(p_data1)):
    programme_list.append([p_data1[i].text.strip(), p_data2[i].text.strip(),
                           p_data3[i].text.strip(), p_data4[i].text.strip()])

with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["program_info", "program_time", "sport", "program_text"])
    for row in programme_list:
        writer.writerow(row)
I am trying to download multiple .csv files with similar names (AB_daily.csv, BC_daily.csv, etc.) from a URL directory. However, each file is stored in a different folder in the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts, as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import requests
from bs4 import BeautifulSoup

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = [f"{url[:20]}{item['href']}" for item in soup.select(
            "a[href$='AB_daily_hydrometric.csv']")]
        for x in target:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv')
# For this specific task this will work:
import requests
from bs4 import BeautifulSoup

csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/", "ON/", "PE/", "QC/", "SK/", "YT/"]

def main(url):
    with requests.Session() as req:
        for item in links:
            r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
            r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
            for item_d in r_daily.find_all('a'):
                href = item_d.get('href') or ''
                if ".csv" in href:
                    csv_links.append(url + item + "daily/" + href)
            for item_h in r_hourly.find_all('a'):
                href = item_h.get('href') or ''
                if ".csv" in href:
                    csv_links.append(url + item + "hourly/" + href)
        for x in csv_links:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv/')
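Since the end goal is to concatenate the files, a minimal sketch with glob and pandas (this assumes the downloaded daily files keep names ending in _daily_hydrometric.csv, as produced above, and share the same header row):

import glob
import pandas as pd

# stack every downloaded daily CSV into one frame
frames = [pd.read_csv(path) for path in glob.glob("*_daily_hydrometric.csv")]
combined = pd.concat(frames, ignore_index=True)
combined.to_csv("all_daily.csv", index=False)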
I am using the following code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956, 2016))
for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(html_input, "html.parser")
        container = page_soup.findAll("li")
        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)
        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
I get text that does not seem to be inside any "li" container, but it still gets written to the output. This is the unwanted text:
<!--
google_ad_client = "ca-pub-9635531430093553";
/* in medias res */
google_ad_slot = "9880694813";
google_ad_width = 468;
google_ad_height = 60;
//-->
How can I get rid of it?
The text you don't want is coming from a script element. So get rid of the script elements before you start and it works:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq

years = list(range(1956, 2016))
for year in years:
    my_urls = ('http://www.hitparadeitalia.it/hp_yends/hpe' + str(year) + '.htm',)
    my_url = my_urls[0]
    for my_url in my_urls:
        uClient = uReq(my_url)
        html_input = uClient.read()
        uClient.close()
        page_soup = BeautifulSoup(html_input, "html.parser")
        [s.extract() for s in page_soup('script')]  # drop all <script> elements first
        container = page_soup.findAll("li")
        filename = "singoli" + str(year) + ".csv"
        f = open(filename, "w")
        headers = "lista"
        f.write(headers)
        lista = container[0].text
        print("lista: " + lista)
        f.write(lista + "\n")
        f.close()
All I did was add the one line:

[s.extract() for s in page_soup('script')]

which finds the script elements and removes them.
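For what it's worth, a plain loop with decompose() does the same job without building a throwaway list; decompose() both removes the tag from the tree and destroys it:

for s in page_soup('script'):
    s.decompose()  # remove the <script> tag from the tree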
I wrote this code in Python 2.7.13 for scraping a data table from a website:
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + "," + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem,Devizanev,Egyseg,Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
I want to export this to CSV with these 4 columns:

"Penznem Devizanev Egyseg Penznemforintban"

The values are separated with ",", but the last value contains a decimal comma (283,45), so it gets split across two columns. How can I fix it?
You can't easily strip that comma out of the data itself, but what you can simply do is use another separator, i.e. ; (semicolon). When you open the file in Excel or Calc, select semicolon as the separator and you will get the result you expect:
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + ";" + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem;Devizanev;Egyseg;Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
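Alternatively, a sketch using the csv module, which quotes any field containing the delimiter, so a value like 283,45 survives in one column even with the comma separator (this reuses the soup object from above and sticks to Python 2, per the question):

import csv

with open("proba.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerow(["Penznem", "Devizanev", "Egyseg", "Penznemforintban"])
    for record in soup.findAll('tr'):
        # the writer wraps any cell containing a comma in quotes
        row = [cell.text.encode('utf-8') for cell in record.findAll('td')]
        if row:  # skip header rows that have no <td>
            writer.writerow(row)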