How to write a file in a good format with Python?

I'm working on a project and trying to write a file in Python in a nicely readable format. I've tried a lot, but I don't know what's going wrong.
Here is what I tried:
def generate_file(self, lyrics):
    self.path()
    print('We are writing file ............')
    with open('filename.srt', 'w') as lrc:
        for i in range(len(lyrics)):
            add = ''
            if lyrics[i].isnumeric():
                add += '\n'
            elif lyrics[i].isalpha():
                add += '\n\n'
            lrc.write(lyrics[i] + add)
            add += ''
        lrc.close()
    print('We downloaded your file!')
Output:
000:00:00‚000 --> 00:00:00‚000by RentAnAdviser.com100:00:22‚608 --> 00:00:26‚607Drink from me drink fromme oh ah oh ah200:00:26‚803 --> 00:00:30‚602Then we′ll shoot across the symphony300:00:30‚808 --> 00:00:38‚807Then we′ll shoot across the sky400:00:43‚599 --> 00:00:48‚498Oh angels sent from up above500:00:48‚702 --> 00:00:53‚801You know you make my world light up600:00:54‚005 --> 00:00:59‚004When I was down when I was hurt700:00:59‚218 --> 00:01:04‚717You came to lift me up800:01:04‚911 --> 00:01:09‚610Life is a drink and love′s a ****900:01:09‚812 --> 00:01:15‚011Oh now I think I must be miles up1000:01:15‚217 --> 00:01:20‚316When I was hurt withered dried up1100:01:20‚506 --> 00:01:26‚005You came to rain a flood1200:01:26‚217 --> 00:01:28‚716So drink from me drink from me1300:01:28‚900 -
I expected:
0
00:00:00,000 --> 00:00:00,000
by RentAnAdviser.com
1
00:00:17,842 --> 00:00:21,341
Drink from me‚ drink from me
2
00:00:21,537 --> 00:00:23,336
Then we′ll shoot across the sky
3
00:00:23,546 --> 00:00:24,545
Drink from me‚ drink from me
How can I do that?
My project:
from bs4 import BeautifulSoup
import os, requests, platform


class EpicLyricFinderApp:
    def __init__(self):
        self.text = '%20'.join(input('Enter song name and also include singer: ').split(' '))
        self.url = 'https://www.rentanadviser.com/en/subtitles/subtitles4songs.aspx?src=' + self.text
        self.user = None
        self.app()

    def app(self):
        req = requests.get(self.url).content
        soup = BeautifulSoup(req, 'html.parser')
        print('Please wait ...................')
        tag = soup.findAll('table')
        link = [('https://www.rentanadviser.com/en/subtitles/' + l.get('href')) + '&type=srt' for l in [a.find('a') for a in tag]]
        blank_name = [''.join((l.get_text()).split(' ')[17:]) for l in [a.find('a') for a in tag]]
        [print('No. {} ==>> {}'.format(name + 1, blank_name[name])) for name in range(len(blank_name))]
        # Get input from user to choose lyrics
        print('=' * 60)
        while True:
            try:
                self.user = int(input('Which lyrics you wanna download?: '))
            except ValueError:
                continue
            else:
                break
        # Open .srt link
        req1 = requests.get(link[self.user]).content
        soup1 = BeautifulSoup(req1, 'html.parser')
        lyrics = [c.get_text() for c in soup1.findAll('span', attrs={'id': 'ctl00_ContentPlaceHolder1_lblSubtitle'})]
        self.generate_file(lyrics)

    @staticmethod
    def path():
        if platform.system() == 'Linux':
            linux = '/home/rohit/Desktop/lrc'
            if os.path.exists(linux):
                os.chdir(linux)
            else:
                os.mkdir(linux)
                os.chdir(linux)
        else:
            windows = 'Cd:/Users/ABC/rohit/Desktop/lrc'
            if os.path.exists(windows):
                os.chdir(windows)
            else:
                os.mkdir(windows)
                os.chdir(windows)

    def generate_file(self, lyrics):
        self.path()
        print('We are writing file ............')
        with open('_'.join(self.text.split('%20')) + '.srt', 'w') as lrc:
            for i in range(len(lyrics)):
                add = ''
                if lyrics[i].isnumeric():
                    add += '\n'
                elif lyrics[i].isalpha():
                    add += '\n\n'
                lrc.write(lyrics[i] + add)
                add += ''
            lrc.close()
        print('We downloaded your file!')


if __name__ == '__main__':
    app = EpicLyricFinderApp()
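A minimal sketch of one likely fix (an assumption, not a confirmed solution): the run-together output usually happens because get_text() drops the <br/> line breaks inside the subtitle span, so every block ends up on one line. Passing a separator to get_text() keeps those breaks; the span id below is copied from the question's own code, and download_srt is a hypothetical helper for illustration.

from bs4 import BeautifulSoup
import requests


def download_srt(url, out_path):
    # url is assumed to be the "&type=srt" link built in app()
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    span = soup.find('span', attrs={'id': 'ctl00_ContentPlaceHolder1_lblSubtitle'})
    if span is None:
        raise ValueError('subtitle span not found')
    text = span.get_text('\n')  # keep the <br/> breaks as real newlines
    with open(out_path, 'w', encoding='utf-8') as lrc:
        lrc.write(text)

With the line breaks preserved, each index, timestamp line, and lyric line lands on its own line, which matches the expected .srt layout shown above.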

Related

Need Wikipedia web scraper to continuously ask for user input

I need the code below to ask for user input again after executing and showing results. I guess a while loop would be best, but I'm not sure how to do it since the BeautifulSoup and requests libraries are in use.
Any help would be greatly appreciated.
import requests
from bs4 import BeautifulSoup

user_input = input("Enter article:")
response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
soup = BeautifulSoup(response.text, "html.parser")
list = []
count = 0
IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#", "About this", ".ogg", "disambiguation", "Edit section"]
for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
    if count <= 10:
        title = tag.get("title", "")
        if not any(x in title for x in IGNORE) and title != "":
            count = count + 1
            print(title)
            list.append(title)
    else:
        break
Use a function with a return statement.
Example
import requests
from bs4 import BeautifulSoup

IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#", "About this", ".ogg", "disambiguation",
          "Edit section"]


def get_user_input():
    user_input = input("Enter article:")
    if len(str(user_input)) > 0:
        return get_response(user_input)
    else:
        return get_user_input()


def get_response(user_input):
    response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
    soup = BeautifulSoup(response.text, "html.parser")
    title_list = []
    count = 0
    for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
        if count <= 10:
            title = tag.get("title", "")
            if not any(x in title for x in IGNORE) and title != "":
                count = count + 1
                print(title)
                title_list.append(title)
                print(title_list)
        else:
            return get_user_input()


if __name__ == '__main__':
    get_user_input()
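Since the question itself suggests a while loop, here is a minimal alternative sketch under that assumption (the selector and IGNORE list are copied from the question); it loops until the user submits a blank article name instead of recursing:

import requests
from bs4 import BeautifulSoup

IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#", "About this",
          ".ogg", "disambiguation", "Edit section"]

while True:
    user_input = input("Enter article (blank to quit): ")
    if not user_input:
        break
    response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
    soup = BeautifulSoup(response.text, "html.parser")
    titles = []
    for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
        title = tag.get("title", "")
        if title and not any(x in title for x in IGNORE):
            print(title)
            titles.append(title)
            if len(titles) > 10:  # same cut-off as the original "count <= 10"
                break
    print(titles)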

How to go to the next page of a website? I have an issue navigating to the next page

Please, can anyone help me? I'm trying to navigate through the pages, but my code is not working. I get the product details for the first page; I want to scrape the details for all the pages on the website. Below is my code, please check it for reference. Thanks in advance.
Below is the link to the website:
https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60
import xlwt
from selenium import webdriver
import re
import time
from datetime import date


class kotsovolosmobiles:
    def __init__(self):
        self.url = 'https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country = 'GR'
        self.currency = 'euro'
        self.VAT = 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice = 'N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0, 0, "Product_Url")
        ws.write(0, 0, "Product_Manufacturer")
        ws.write(0, 1, "Product_Url")
        ws.write(0, 2, "Product_Price")
        ws.write(0, 3, "Product_Model")
        ws.write(0, 4, "Memory")
        ws.write(0, 5, "Currency")
        ws.write(0, 6, "Color")
        ws.write(0, 7, "VAT")
        ws.write(0, 8, "Shipping Cost")
        ws.write(0, 9, "Pre-PromotionPrice")
        ws.write(0, 10, "Country")
        ws.write(0, 11, "Date")
        ws.write(0, 12, "Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links = []
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall('\d+ ?[gG][bB]', p_model)
                print(memory)
                memory1 = str(memory).replace("['", '').replace("']", '').replace("[]", '').strip()
                if "," in memory1:
                    arr = memory1.split(",")
                    for str1 in arr:
                        str2 = str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2) != 1:
                            memory_str = str1
                            break
                elif (memory1 == ""):
                    memory_str = 'N/A'
                else:
                    memory_str = memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors = []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length - 3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['", '').replace("']", '').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)


kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
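No answer was posted above, but here is one hedged guess at the usual culprit: the while True loop never terminates, so on the last page find_element_by_css_selector('.pagination_next a') raises and the script dies. A minimal sketch that stops cleanly, assuming the selector from the question is correct ('driver' is the webdriver.Chrome() instance created in kotsovolos()):

import time
from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the product boxes on the current page, exactly as in the question ...
    try:
        next_link = driver.find_element_by_css_selector('.pagination_next a')
    except NoSuchElementException:
        print("no more pages")
        break
    time.sleep(3)
    next_link.click()
    print("next page")
    time.sleep(3)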

Finding when a webpage updates via Python?

So, I am scraping a webpage, and there is an element on the page that displays an integer. When I scrape that element, I store the plain text in a variable; then, each time it scrapes, I compare the variable to the plain text currently on the webpage. I am not sure if maybe I need to make a request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"


def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if (t.text.strip() == "Rating"):
                val = stat.find(class_="value").text.strip()
                return val


def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()


_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time

while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message=_message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')
    # page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message=_message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None
while True:
    recent_mmr = get_mmr(URL)
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
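The question also asks whether a fresh request is needed each time, and it is: get_mmr() reads the soup that was parsed once at the top of the script, so it can never observe an update. A minimal sketch that re-fetches inside the function (the class names "stat", "label" and "value" are taken from the question's code, not re-checked against the site):

import requests
from bs4 import BeautifulSoup


def get_mmr(url):
    # Re-download and re-parse the page on every call, otherwise the value
    # compared in the loop never changes.
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()
    return None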

YouTube API: handling a deleted video error

I have written code to get playlists, and the video lists within them, into separate text files:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
YouTube Playlist Extractor.
A tool to extract playlists from the YouTube API, which in today's YouTube page format is very difficult to extract.
It also extracts video lists per playlist and hence takes a bit longer to run for long playlists.
"""
# from profiler import Profiler
from xml.dom.minidom import parseString
import os

try:
    import urllib.request as urlLibReq
    PY3 = True
except:
    import urllib as urlLibReq
    PY3 = False


def getInput():
    if PY3:
        return input("Enter username of YouTube channel: ")
    elif not PY3:
        return raw_input("Enter username of YouTube channel: ")


def xmlParser(url):
    page = urlLibReq.urlopen(url)
    text = page.read().decode("utf8")
    return parseString(text)


def extractplaylist(userId):
    url = "https://gdata.youtube.com/feeds/api/users/" + userId + "/playlists?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
            id_split = id_data.split(':')
            playlist_id = id_split[5]
            playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            extractvideolist(userId, playlist_id, playlist_title)
            listEntry.append(str(playlist_title))
            startIndex += 1
    listEntry.sort()
    writer = open(userId + "_playlist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()


def extractvideolist(userId, playlist_id, playlist_title):
    url = "http://gdata.youtube.com/feeds/api/playlists/" + playlist_id + "?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            video_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            listEntry.append(str(video_title))
            startIndex += 1
    playlist_title = playlist_title.replace("'", "\'")
    writer = open(playlist_title + "_videolist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()
    print("written", playlist_title)
    try:
        os.mkdir(userId)
    except:
        pass
    os.system('mv "' + playlist_title + '_videolist.txt" ' + userId)


if __name__ == "__main__":
    name = getInput()
    extractplaylist(name)
    # Profiler.report()
The code fails when there is a deleted video in the playlist. How do I deal with such a thing?
Try adding an else clause to your for loop to break out of the while loop when the for loop ends.
while startIndex <= total:
    url_new = url + "&max-results=50&start-index=" + str(startIndex)
    dom = xmlParser(url_new)
    entry = dom.getElementsByTagName("entry")
    for node in entry:
        id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
        id_split = id_data.split(':')
        playlist_id = id_split[5]
        playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
        extractvideolist(userId, playlist_id, playlist_title)
        listEntry.append(str(playlist_title))
        startIndex += 1
    else:
        break
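As for the deleted-video failure itself, a hedged sketch of another common approach: guard the per-entry parsing so one bad entry does not abort the whole playlist (this assumes the crash is an IndexError or AttributeError raised when a deleted entry lacks the expected child nodes):

for node in entry:
    try:
        video_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
    except (IndexError, AttributeError):
        continue  # deleted/private video: skip it instead of crashing
    listEntry.append(str(video_title))
    startIndex += 1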

Gevent link crawler

Here I have written code using Python and Beautiful Soup to parse all the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links have been fetched.
But this code is very slow. How can I improve it with asynchronous programming using gevent in Python?
Code
class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # Magnificent exception handling
            print 'Error: ', e
            pass
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all urls have been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):
    def __init__(self):
        self.soup = None                               # Beautiful Soup object
        self.current_page = "http://www.python.org/"   # Current page's address
        self.links = set()                             # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                               # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)
        # Fetch every link
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception, ex:  # Magnificent exception handling
            print ex
        # Update links
        self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all urls have been fetched)
        crawling_greenlets = []
        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))
        gevent.joinall(crawling_greenlets)
        # while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #     self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not
your problem at the moment)
In your run, have your open function called inside a greenlet. run can return the greenlet object, so you can wait for it whenever you need the results, using gevent.joinall for example. Something like this:
def run(self):
    return gevent.spawn(self.open)


c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1, c2, c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
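As a follow-up usage note, the same pattern scales to fetching many URLs at once with a gevent pool. A minimal, self-contained sketch (the URLs here are placeholders, not taken from the question):

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
import urllib2


def fetch(url):
    # Download one page; return None for the body on any failure.
    try:
        return url, urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return url, None


urls = ["http://www.python.org/", "http://www.python.org/about/"]
pool = Pool(5)  # at most 5 concurrent fetches
for url, body in pool.imap_unordered(fetch, urls):
    print url, (len(body) if body else "failed")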
