I have written code that saves a channel's playlists, and the video list within each playlist, to separate text files:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
YouTube Playlist Extractor.
A tool to extract playlists via the YouTube API, which is very difficult to do from today's YouTube page format.
It also extracts the video list for each playlist, so it takes a bit longer to run for long playlists.
"""
#from profiler import Profiler
from xml.dom.minidom import parseString
import os

try:
    import urllib.request as urlLibReq
    PY3 = True
except:
    import urllib as urlLibReq
    PY3 = False

def getInput():
    if PY3:
        return input("Enter username of YouTube channel: ")
    elif not PY3:
        return raw_input("Enter username of YouTube channel: ")

def xmlParser(url):
    page = urlLibReq.urlopen(url)
    text = page.read().decode("utf8")
    return parseString(text)

def extractplaylist(userId):
    url = "https://gdata.youtube.com/feeds/api/users/"+ userId +"/playlists?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1 , []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index="+ str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
            id_split = id_data.split(':')
            playlist_id = id_split[5]
            playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            extractvideolist(userId, playlist_id, playlist_title)
            listEntry.append(str(playlist_title))
            startIndex += 1
    listEntry.sort()
    writer = open(userId+"_playlist.txt","w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()

def extractvideolist(userId, playlist_id, playlist_title):
    url = "http://gdata.youtube.com/feeds/api/playlists/"+ playlist_id +"?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1 , []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index="+ str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            video_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            listEntry.append(str(video_title))
            startIndex += 1
    playlist_title = playlist_title.replace("'","\'")
    writer = open(playlist_title+"_videolist.txt","w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()
    print("written", playlist_title)
    try: os.mkdir(userId)
    except: pass
    os.system('mv "'+ playlist_title +'_videolist.txt" '+ userId)

if __name__ == "__main__":
    name = getInput()
    extractplaylist(name)
    #Profiler.report()
The code fails when there is a deleted video in the playlist. How do I deal with such a thing?
Try adding an else clause to your for loop to break out of the while loop when the for loop ends.
while startIndex <= total:
    url_new = url + "&max-results=50&start-index="+ str(startIndex)
    dom = xmlParser(url_new)
    entry = dom.getElementsByTagName("entry")
    for node in entry:
        id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
        id_split = id_data.split(':')
        playlist_id = id_split[5]
        playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
        extractvideolist(userId, playlist_id, playlist_title)
        listEntry.append(str(playlist_title))
        startIndex += 1
    else:
        break
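If the failure is an exception raised while reading the entry of a deleted video, it may also be enough to skip entries that are missing the fields the script expects. Below is a minimal sketch for the inner loop of extractvideolist, assuming a deleted video shows up as an entry without a usable title node; the rest of the function stays as it is.

for node in entry:
    startIndex += 1  # count the entry even when it is skipped
    title_nodes = node.getElementsByTagName("title")
    if not title_nodes or title_nodes[0].firstChild is None:
        continue  # deleted/private video: nothing usable to record
    video_title = title_nodes[0].firstChild.nodeValue
    listEntry.append(video_title)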
I'm making a program that scrapes a website for lyrics by a number of artists and saves the lyrics as .txt files in directories named after their respective albums.
But after my program has finished the first artist, it keeps looping over the same artist. Why?
Code:
import os
from bs4 import BeautifulSoup
import ssl
import time

os.chdir("D:/Folder")

import urllib.request

if os.path.isfile('hist'):
    #creating history file (r for read), so that we get a link for every song we have scraped, so that we don't scrape it again (we need to scrape each file only once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history=[]

artists=["lil wayne","bob dylan","beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context

urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome,"data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front,features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/"+str(artist)):
        os.mkdir("D:/Folder/"+str(artist))
    link=urlhome+str(artist[0])+"/"+artist.replace(" ","+")
    getartist=urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist,features="lxml")
    albums=artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com"+album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage,features="lxml")
            albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip()+" "+albumyear
        if not os.path.exists("D:/Folder/"+str(artist)+"/"+albumname):
            os.mkdir("D:/Folder/"+str(artist)+"/"+albumname)
        songs = albumsoup.findAll("a",href=True,attrs={"class":"lf-link lf-link--secondary"})
        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '-already on drive')
                continue #if it's already scraped, it continues to the next song
            time.sleep(3)
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com"+song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage,features="lxml")
                songname = songsoup.find("span",attrs={"class":"item-header-color"}).text[:-7]
                lyrics = songsoup.find("div",attrs={"id":"content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist+"/"+albumname+"/"+(songname)+".txt","w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file: #a for append
                    file.write(song['href'] + '\n')
                print("parsing "+str(songname))
The block of code after:
if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
    albumurl = "https://www.lyricsfreak.com"+album["href"]
    albumpage = urllib.request.urlopen(albumurl)
    albumsoup = BeautifulSoup(albumpage,features="lxml")
    albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
    albumname = album.text.strip()+" "+albumyear
needs to be indented so that it falls within that conditional statement. Otherwise, the loop skips that little block and then repeats everything using the last albumurl string that was stored.
Full Code:
import os
from bs4 import BeautifulSoup
import ssl
import time

os.chdir("D:/Folder")

import urllib.request

if os.path.isfile('hist'):
    #creating history file (r for read), so that we get a link for every song we have scraped, so that we don't scrape it again (we need to scrape each file only once)
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history=[]

artists=["lil wayne","bob dylan","beyonce"]

ssl._create_default_https_context = ssl._create_unverified_context

urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome,"data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front,features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/"+str(artist)):
        os.mkdir("D:/Folder/"+str(artist))
    link=urlhome+str(artist[0])+"/"+artist.replace(" ","+")
    getartist=urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist,features="lxml")
    albums=artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0])+"/"+artist.replace(" ","+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com"+album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage,features="lxml")
            albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip()+" "+albumyear
            if not os.path.exists("D:/Folder/"+str(artist)+"/"+albumname): #<-- INDENT REST OF CODE
                os.mkdir("D:/Folder/"+str(artist)+"/"+albumname)
            songs = albumsoup.findAll("a",href=True,attrs={"class":"lf-link lf-link--secondary"})
            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '-already on drive')
                    continue #if it's already scraped, it continues to the next song
                time.sleep(3)
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com"+song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage,features="lxml")
                    songname = songsoup.find("span",attrs={"class":"item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div",attrs={"id":"content"})
                    fixedlyrics = lyrics.text.strip()
                    lyricfile = open(artist+"/"+albumname+"/"+(songname)+".txt","w")
                    lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file: #a for append
                        file.write(song['href'] + '\n')
                    print("parsing "+str(songname))
I am scraping a webpage that displays an integer in one of its elements. When I scrape that element, I store the plain text in a variable, and each time I scrape again I compare the variable against the plain text currently on the webpage. I am not sure whether I need to make a new request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"

def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if(t.text.strip() == "Rating"):
                val = stat.find(class_="value").text.strip()
                return val

def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()

_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time

while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message= _message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')
    #page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp +"(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message= _message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
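On whether a new request is needed each time: yes. The soup in the posted script is built once at start-up, so every later call to get_mmr() re-reads that same snapshot of the page. Below is a minimal sketch of get_mmr() that fetches the page on every call, assuming the same class names used in the question.

import requests
from bs4 import BeautifulSoup

def get_mmr(url):
    # fetch a fresh copy of the page on every call
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()
    return None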
I'm working on a project and trying to write a file in Python in a nicely laid-out format. I've tried a lot of things, but I don't know what's going wrong.
I tried:
def generate_file(self, lyrics):
    self.path()
    print('We are writing file ............')
    with open('filename.srt', 'w') as lrc:
        for i in range(len(lyrics)):
            add = ''
            if lyrics[i].isnumeric():
                add += '\n'
            elif lyrics[i].isalpha():
                add += '\n\n'
            lrc.write(lyrics[i]+add)
            add += ''
        lrc.close()
    print('We downloaded your file!')
Output:
000:00:00‚000 --> 00:00:00‚000by RentAnAdviser.com100:00:22‚608 --> 00:00:26‚607Drink from me drink fromme oh ah oh ah200:00:26‚803 --> 00:00:30‚602Then we′ll shoot across the symphony300:00:30‚808 --> 00:00:38‚807Then we′ll shoot across the sky400:00:43‚599 --> 00:00:48‚498Oh angels sent from up above500:00:48‚702 --> 00:00:53‚801You know you make my world light up600:00:54‚005 --> 00:00:59‚004When I was down when I was hurt700:00:59‚218 --> 00:01:04‚717You came to lift me up800:01:04‚911 --> 00:01:09‚610Life is a drink and love′s a ****900:01:09‚812 --> 00:01:15‚011Oh now I think I must be miles up1000:01:15‚217 --> 00:01:20‚316When I was hurt withered dried up1100:01:20‚506 --> 00:01:26‚005You came to rain a flood1200:01:26‚217 --> 00:01:28‚716So drink from me drink from me1300:01:28‚900 -
I expected:
0
00:00:00,000 --> 00:00:00,000
by RentAnAdviser.com
1
00:00:17,842 --> 00:00:21,341
Drink from me‚ drink from me
2
00:00:21,537 --> 00:00:23,336
Then we′ll shoot across the sky
3
00:00:23,546 --> 00:00:24,545
Drink from me‚ drink from me
How can I do that?
My project:
from bs4 import BeautifulSoup
import os, requests, platform

class EpicLyricFinderApp:
    def __init__(self):
        self.text = '%20'.join(input('Enter song name and also include singer: ').split(' '))
        self.url = 'https://www.rentanadviser.com/en/subtitles/subtitles4songs.aspx?src='+self.text
        self.user = None
        self.app()

    def app(self):
        req = requests.get(self.url).content
        soup = BeautifulSoup(req, 'html.parser')
        print('Please wait ...................')
        tag = soup.findAll('table')
        link = [('https://www.rentanadviser.com/en/subtitles/'+l.get('href'))+'&type=srt' for l in [a.find('a') for a in tag]]
        blank_name = [''.join((l.get_text()).split(' ')[17:]) for l in [a.find('a') for a in tag]]
        [print('No. {} ==>> {}'.format(name+1,blank_name[name])) for name in range(len(blank_name))]
        # Get input from user to choose the lyrics
        print('='*60)
        while True:
            try:
                self.user = int(input('Which lyrics you wanna download?: '))
            except ValueError:
                continue
            else:
                break
        # Open .srt link
        req1 = requests.get(link[self.user]).content
        soup1 = BeautifulSoup(req1, 'html.parser')
        lyrics = [c.get_text() for c in soup1.findAll('span', attrs={'id':'ctl00_ContentPlaceHolder1_lblSubtitle'})]
        self.generate_file(lyrics)

    @staticmethod
    def path():
        if platform.system()=='Linux':
            linux = '/home/rohit/Desktop/lrc'
            if os.path.exists(linux):
                os.chdir(linux)
            else:
                os.mkdir(linux)
                os.chdir(linux)
        else:
            windows = 'Cd:/Users/ABC/rohit/Desktop/lrc'
            if os.path.exists(windows):
                os.chdir(windows)
            else:
                os.mkdir(windows)
                os.chdir(windows)

    def generate_file(self, lyrics):
        self.path()
        print('We are writing file ............')
        with open('_'.join(self.text.split('%20'))+'.srt', 'w') as lrc:
            for i in range(len(lyrics)):
                add = ''
                if lyrics[i].isnumeric():
                    add += '\n'
                elif lyrics[i].isalpha():
                    add += '\n\n'
                lrc.write(lyrics[i]+add)
                add += ''
            lrc.close()
        print('We downloaded your file!')

if __name__ == '__main__':
    app = EpicLyricFinderApp()
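A likely reason for the run-together output: findAll() returns a single span here, so lyrics is one long string and get_text() drops the line breaks that the page marks with <br> tags. One possible fix, sketched below as a hypothetical standalone helper (write_srt and its arguments are illustrative, and it assumes the subtitle span really does use <br> tags), is to let BeautifulSoup reinsert the newlines via get_text(separator='\n').

from bs4 import BeautifulSoup

def write_srt(page_html, filename):
    # pull the subtitle span and keep its <br> line breaks as real newlines
    soup = BeautifulSoup(page_html, 'html.parser')
    span = soup.find('span', attrs={'id': 'ctl00_ContentPlaceHolder1_lblSubtitle'})
    text = span.get_text(separator='\n') if span else ''
    with open(filename, 'w') as lrc:
        lrc.write(text)

Inside app() this could replace the call to generate_file(), for example: write_srt(req1, '_'.join(self.text.split('%20')) + '.srt').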
I am a beginner in Python.
How can I solve
AttributeError: module 'urllib' has no attribute 'Request'
I have looked at other posts, but I still can't understand how to solve the problem.
Here is the screen capture of the error.
And this is the code (I am following https://github.com/minimaxir/facebook-page-post-scraper/blob/master/get_fb_posts_fb_page.py):
import urllib.request
import json, datetime, csv, time

app_id = "xxx"
app_secret = "xxx" # DO NOT SHARE WITH ANYONE!
access_token = "xxx"
page_id = 'xxx'

def testFacebookPageData(page_id, access_token):
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id +'/feed'
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    # retrieve data
    response = urllib.request.urlopen(url)
    data = json.loads(response.read().decode('utf-8'))
    print (data)

def request_until_succeed(url):
    req = urllib.request.urlopen(url)
    success = False
    while success is False:
        try:
            response = urllib.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print (e)
            time.sleep(5)
            print (url, datetime.datetime.now())
    return response.read()

def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed"
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token) # changed
    url = base + node + parameters
    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data

def processFacebookPageFeedStatus(status):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    # Additionally, some items may not always exist,
    # so must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']
    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'],'%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5) # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S') # best time format for spreadsheet programs
    # Nested items require chaining dictionary keys.
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']
    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])

        has_next_page = True
        num_processed = 0 # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print (page_id, scrape_starttime)

        statuses = getFacebookPageFeedData(page_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))

                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print (num_processed, datetime.datetime.now())

            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False

        print (num_processed, datetime.datetime.now() - scrape_starttime)

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
There is no urllib.Request() in Python 3 - there is urllib.request.Request().
EDIT: the error message shows url = urllib.Request(url), but I don't see that line in your code - maybe you are running the wrong file.
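For the same reason, the retry path inside request_until_succeed calls urllib.urlopen(req), which does not exist in Python 3 either. Below is a minimal sketch of that helper written entirely against urllib.request, keeping the question's retry-every-5-seconds behaviour.

import time
import datetime
import urllib.request

def request_until_succeed(url):
    # keep retrying until the Graph API answers with HTTP 200
    while True:
        try:
            response = urllib.request.urlopen(urllib.request.Request(url))
            if response.getcode() == 200:
                return response.read()
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())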
I am trying to obtain the number of calendar resources per domain using the Calendar Resources API, but the code is returning what are clearly bad results. Specifically, it insists that all domains have the same calendar count.
I have two functions for this, and both return the same (bad) results:
def getCalendarCountFor(domain, userMail, password):
    client = CalendarResourceClient(domain=domain)
    client.ClientLogin(userMail, password, "test_app")
    calendar_resources = client.GetResourceFeed()
    return len(calendar_resources.entry)
The second version:
def GoogleQueryCalendars(dom, admin_id, admin_pwd):
    today = datetime.datetime.now().strftime("%Y-%m-%d %H:%S")
    calendarClient = CalendarResourceClient(domain=dom)
    calendarClient.ClientLogin(email=admin_id, password=admin_pwd, source='TheSource')
    resourceCount = 0
    # loop through all the calendar feeds
    try:
        moreCalendars = calendarClient.GetResourceFeed()
    except:
        print "Exception"
    calendars = {}
    while moreCalendars.entry is not None:
        for i, cal in enumerate(moreCalendars.entry):
            str = cal.GetResourceCommonName()
            pseudoDomain = re.sub("[^A-Z\d]", "", re.search("^[^-\s]*", str).group(0)).lower()
            if pseudoDomain in calendars:
                calendars[pseudoDomain] += 1
            else:
                calendars[pseudoDomain] = 1
            resourceCount += 1
        try:
            moreCalendars = calendarClient.GetNext(moreCalendars)
        except:
            break
    return resourceCount
Thanks.
Here's a method for counting calendar resources.
def count_resources(domain, email, password):
    client = CalendarResourceClient(domain=domain)
    client.ClientLogin(email=email,
                       password=password,
                       source='TheSource')
    count = 0
    uri = client.MakeResourceFeedUri()
    while uri:
        feed = client.GetResourceFeed(uri)
        count += len(feed.entry)
        next_link = feed.GetNextLink()
        uri = next_link.href if next_link else None
    return count
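A hypothetical per-domain usage (the domains and credentials below are placeholders) would then look like this; because each call builds its own client for its own domain, the counts stay separate.

domains = ["example.com", "example.org"]  # placeholder domains
for dom in domains:
    # placeholder admin credentials for each domain
    print(dom, count_resources(dom, "admin@" + dom, "secret"))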