I'm writing a program that scrapes a website for lyrics by a list of artists and saves the lyrics as .txt files in directories named after their respective albums.
But after my program has finished the first artist, it keeps looping over the same artist. Why?
Code:
import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")
import urllib.request

if os.path.isfile('hist'):
    # reading the history file (r for read): it holds a link for every song we have already
    # scraped, so that we don't scrape it again (each song only needs to be scraped once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip() + " " + albumyear
        if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
            os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
        songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})
        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '- already on drive')
                continue  # if it's already scraped, continue to the next song
            time.sleep(3)
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com" + song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage, features="lxml")
                songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                lyrics = songsoup.find("div", attrs={"id": "content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist + "/" + albumname + "/" + songname + ".txt", "w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file:  # a for append
                    file.write(song['href'] + '\n')
                print("parsing " + str(songname))
The block of code after this:

if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
    albumurl = "https://www.lyricsfreak.com"+album["href"]
    albumpage = urllib.request.urlopen(albumurl)
    albumsoup = BeautifulSoup(albumpage,features="lxml")
    albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
    albumname = album.text.strip()+" "+albumyear

needs to be indented so that it is included in that condition. Otherwise, whenever the condition is false, it just skips that little block and then repeats everything using the last albumurl that was stored.
Full Code:
import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")
import urllib.request

if os.path.isfile('hist'):
    # reading the history file (r for read): it holds a link for every song we have already
    # scraped, so that we don't scrape it again (each song only needs to be scraped once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip() + " " + albumyear
            if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):  # <-- INDENT REST OF CODE
                os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
            songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})
            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '- already on drive')
                    continue  # if it's already scraped, continue to the next song
                time.sleep(3)
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com" + song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage, features="lxml")
                    songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div", attrs={"id": "content"})
                    fixedlyrics = lyrics.text.strip()
                    lyricfile = open(artist + "/" + albumname + "/" + songname + ".txt", "w")
                    lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file:  # a for append
                        file.write(song['href'] + '\n')
                    print("parsing " + str(songname))
Related
As of right now I have working code: a web scraper that logs into the Indeed job search site. My issue is that I need to create a CSV file that lists every single job position that was found; currently it only gives me the number of positions available and the description of one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime  # needed for datetime.today() in get_record
from time import sleep
from random import randint

jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = {"q": "mechanical+engineer", "l": "united+states", "start": 0}
url = "https://www.indeed.com/jobs"

job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

len(job_keys)  # number of unique job keys collected

template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)

def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
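The code above calls get_url(position, location), which is not shown in the snippet. A minimal sketch of what that helper might look like, assuming the same q and l query parameters used in the params dict at the top (the exact URL template is an assumption):

def get_url(position, location):
    """Build an Indeed search URL from a position and a location (assumed format)."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    # spaces become '+' so the query string matches the params dict used above
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))

# Hypothetical usage: main('mechanical engineer', 'united states') would then crawl
# the result pages and write results.csv with one row per job card.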
I am a student working on a scraping project, and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of the data until the end, so my solution would be to break the scrape into smaller pieces and write the data out periodically, rather than building one big list and writing it all out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)
    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"
    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))
driver.close()
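For reference, the "write out periodically, then clear the buffer" idea described in the question could be done with a small helper like the sketch below. It reuses the Spredsheet.txt name from the code above, assumes each profile's fields are appended to Data as one tuple (rather than extended into a flat list), and the flush threshold of 50 is an arbitrary assumption:

import csv

def flush_rows(rows, path="Spredsheet.txt"):
    """Append buffered rows to disk, then clear the buffer to free memory."""
    with open(path, "a", newline="", encoding="utf-8") as output:
        csv.writer(output).writerows(rows)
    rows.clear()

# Inside the profile loop, instead of Data.extend(info):
#     Data.append(info)
#     if len(Data) >= 50:      # hypothetical flush threshold
#         flush_rows(Data)
# ...and call flush_rows(Data) one last time after the loop for any leftovers.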
Try the approach below using requests and BeautifulSoup. In the script I have used the API URL fetched from the website itself (the search_results.php endpoint).
On the first iteration it creates the URL, adds the headers, and writes the data to the .csv file.
On the second and later iterations it creates the URL again with 2 extra params, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration, and show_per_page defaults to 100 so that 100 records are extracted per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps 4 things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue is resolved by this approach.
Do not forget to set the path where you want the .csv file created in the file_path variable before running the script.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)
        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()
I have tried many times, but it does not work:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
selctor = etree.HTML(base_html)

date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    i = int(1)
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)
    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)

# for k, v in jd_goods_data.items():
#     print(k, v)
I am trying to download some details, but it doesn't work. If I type 2 pages to scan, it only downloads one page of details, but twice!
OK, you define q but you do not actually use it as such. In this case, the convention is to name this unused variable _. That is, instead of

for q in range(page_num):

you should write

for _ in range(page_num):

That way, other programmers will know right away that you do not use q and only want the operation to be repeated.
You report that the same page gets downloaded twice, which means that (for some reason) the line driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click() does not execute correctly. There is surely a way to make it work, but in your case I notice that your URL contains a parameter named page. I recommend you use it instead, which also means actually using the variable q, as follows:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

driver = webdriver.Chrome()
date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'.format(page=q)
    driver.get(url)
    base_html = driver.page_source
    selctor = etree.HTML(base_html)
    i = 1
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)
    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

driver.quit()
I have set my Windows power settings so that my computer never turns off and never goes to sleep. I've set up the following Python script to run as a scheduled task, and it runs fine until about half an hour after I've logged off my computer, when it mysteriously stops. There is no error message in the Events log, and the memory used by the process doesn't appear to be spiking. I don't know what's going on. Please help!
import datetime
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import logging
import shutil

def get_soup(theurl):
    html = urlopen(theurl)
    return BeautifulSoup(html.read(), "lxml")

def output_data(soup, filename, fieldnames, rows_with_info, supervision_row_count):
    with open(filename, 'a', newline='') as csvfile:
        mywriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        mydict = {}
        # Scraping first table
        offender_table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "offender-search-details")
        rows = offender_table.findAll(lambda tag: tag.name == 'tr')
        mydict[fieldnames[0]] = clean_data(name_and_id[0])
        mydict[fieldnames[1]] = clean_data(name_and_id[1])
        # lots of similar code removed for sake of brevity
        mywriter.writerow(mydict)

# (thebaseurl, filename, fieldnames, clean_data, name_and_id and the logger setup are
# presumably defined in the code removed for brevity)
start_id = 10
max_id = 199999
for the_id in range(start_id, max_id):
    logger.info('running with id-' + str(the_id))
    theurl = thebaseurl + str(the_id)
    soup = get_soup(theurl)
    sentence_table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "offender-search-sentence-info")
    if sentence_table:
        logger.info('found data for id-' + str(the_id))
        sentence_rows = sentence_table.findAll(lambda tag: tag.name == 'tr')
        supervision_row_count = 0
        for the_row_index in range(0, len(sentence_rows)):
            col_count = sentence_rows[the_row_index].find_all('td')
            if (len(col_count) == 2):
                supervision_row_count = supervision_row_count + 1
        supervision_row_count = supervision_row_count - 1
        rows_with_info = len(sentence_rows) - 4 - supervision_row_count
        output_data(soup, filename, fieldnames, rows_with_info, supervision_row_count)

logger.info('finished-' + str(datetime.datetime.now()))
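The snippet uses a logger whose setup is in the part of the code removed for brevity; for completeness, a typical file-based configuration might look like the sketch below (the log file name is hypothetical):

import logging

# Hypothetical setup for the `logger` used above: writes INFO-level messages
# with timestamps to a file, so progress is preserved across a logoff.
logging.basicConfig(
    filename="scrape.log",
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)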
I have been struggling with this for over a week now. I am trying to learn Python and build something that will be useful to me at the same time: something to help me find a new home to rent.
I have all the code working the way I want it, except I cannot get all 550 properties; I can only get the first 25 on page 1. I have tried several methods but nothing seems to work.
If I use urlopen and take the main URL plus the page-number suffix ("2_p/", for example), I get a URLError: unknown url type 'h'.
If I use webdriver, Firefox tries to look up www.h.com. I really could use a little help. Attached is my code. Sorry it is kind of messy and probably a little long; I am just learning, so don't be too cruel.
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re

pages1 = set()
next_page = ()
csv_output = []

def getLinks(url):
    global pages
    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages1:
                newPage = addr_link.attrs['href']
                pages1.add(newPage)
                # print(newPage)
    print(len(pages1))
    for link in pages1:
        property_url = link
        getData(property_url)

def getData(url):
    base_url = 'http://www.zillow.com'
    final_url = base_url + url
    html = urlopen(final_url)
    bsObj = BeautifulSoup(html)
    try:
        # Gets property address
        address = bsObj.find("header", {"class": "zsg-content-header addr"}).find("h1")
        s_address = address.get_text()
        print(address)
        # Gets number of bedrooms
        beds = bsObj.find("span", {"class": "addr_bbs"})
        s_beds = beds.get_text()
        # Gets number of bathrooms
        baths = beds.find_next("span", {"class": "addr_bbs"})
        s_baths = baths.get_text()
        # Gets sqft
        sqft = baths.find_next("span", {"class": "addr_bbs"})
        s_sqft = sqft.get_text()
        # Gets rent_amount
        rent_amount = bsObj.find("div", {"class": "main-row home-summary-row"}).span
        s_rent_amount = rent_amount.get_text()
        # Gets rent_zestiment
        zestiment_holder = bsObj.find("span", {"class": "zsg-tooltip-launch zsg-tooltip-launch_keyword"})
        rent_zestiment = zestiment_holder.find_next("span")
        s_rent_zestiment = rent_zestiment.get_text()
        # Gets date posted on Zillow
        for time_posted in bsObj(text=re.compile("Posted")):
            posted = time_posted.parent.get_text()
            if 'hours' in posted:
                date_on_zillow = datetime.date.today()
                posted_date = date_on_zillow
            else:
                days_subtracted = int(re.search(r'\d+', posted).group())
                posted_date = datetime.date.today() - datetime.timedelta(days=+days_subtracted)
        # Gets property subdivision
        subdivision = bsObj.find(id="hdp-neighborhood").h2
        s_subdivision = subdivision.get_text()
        # Gets property_manager_name
        property_manager_name = bsObj.find("span", {"class": "snl company-name"})
        s_property_manager_name = property_manager_name.get_text()
        # Gets property_manager_phone
        property_manager_phone = bsObj.find("span", {"class": "snl phone"})
        s_property_manager_phone = property_manager_phone.get_text()
        # Gets disc_of_property
        disc_of_property = (bsObj.find('div', {'class': "notranslate"}).text)
        s_disc_of_property = disc_of_property.encode("utf-8")
        # Gets url_of_listing so I can see photos if interested
        main_url = 'http://www.zillow.com'
        url2 = url
        listing_url = main_url + url2
        s_listing_url = listing_url
    except AttributeError as e:
        return None
    csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
    csv_output.append(csv_data)
    resultFile = open("output.csv", 'w+')
    wr = csv.writer(resultFile)
    wr.writerows(csv_output)
    resultFile.close()

header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)

getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")
Edit:
The 'while' that has been commented out was my last attempt at handling the pagination.
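For what it's worth, the page-number idea mentioned above (appending a suffix like 2_p/ to the search URL) could be driven by a simple loop over the getLinks function, as in the rough sketch below. The 22-page count assumes roughly 550 results at 25 per page, and whether the site accepts the _p/ suffix on this URL is an assumption; also note that getLinks as written opens a new browser per call and re-walks everything in pages1, so it would need some restructuring to avoid re-scraping links it has already seen.

# Hypothetical pagination loop: build "...9_zm/N_p/" URLs and feed them to getLinks.
base_search_url = ("http://www.zillow.com/homes/for_rent/Jackson-County-MO/"
                   "house,mobile_type/1804_rid/6m_days/"
                   "39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")

for page in range(1, 23):  # ~550 listings / 25 per page -> ~22 pages (assumed)
    getLinks(base_search_url + str(page) + "_p/")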