Need Wikipedia web scraper to continuously ask for user input - python

I need the code below to ask for user input again after executing and showing results. I guess a while loop would be best, but I'm not sure how to do it since I have the BeautifulSoup and requests libraries in use.
Any help would be greatly appreciated.
import requests
from bs4 import BeautifulSoup

user_input = input("Enter article:")
response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
soup = BeautifulSoup(response.text, "html.parser")
list = []
count = 0
IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#",
          "About this", ".ogg", "disambiguation", "Edit section"]
for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
    if count <= 10:
        title = tag.get("title", "")
        if not any(x in title for x in IGNORE) and title != "":
            count = count + 1
            print(title)
            list.append(title)
    else:
        break

Use a function with a return statement.
Example:
import requests
from bs4 import BeautifulSoup

IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#",
          "About this", ".ogg", "disambiguation", "Edit section"]

def get_user_input():
    user_input = input("Enter article:")
    if len(str(user_input)) > 0:
        return get_response(user_input)
    else:
        return get_user_input()

def get_response(user_input):
    response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
    soup = BeautifulSoup(response.text, "html.parser")
    title_list = []
    count = 0
    for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
        if count <= 10:
            title = tag.get("title", "")
            if not any(x in title for x in IGNORE) and title != "":
                count = count + 1
                print(title)
                title_list.append(title)
                print(title_list)
        else:
            return get_user_input()

if __name__ == '__main__':
    get_user_input()
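Since the question explicitly asks about a while loop, here is a minimal sketch of the same logic driven by while True instead of recursion (deep recursion would eventually hit Python's recursion limit); the blank-input exit is an addition, not part of the original code:

import requests
from bs4 import BeautifulSoup

IGNORE = ["Wikipedia:", "Category:", "Template:", "Template talk:", "User:",
          "User talk:", "Module:", "Help:", "File:", "Portal:", "#",
          "About this", ".ogg", "disambiguation", "Edit section"]

while True:
    # an empty answer ends the program instead of re-prompting forever
    user_input = input("Enter article (blank to quit):")
    if not user_input:
        break
    response = requests.get("https://en.wikipedia.org/wiki/" + user_input)
    soup = BeautifulSoup(response.text, "html.parser")
    title_list = []
    for tag in soup.select('div.mw-parser-output a:not(.infobox a)'):
        title = tag.get("title", "")
        if title and not any(x in title for x in IGNORE):
            print(title)
            title_list.append(title)
        if len(title_list) > 10:  # same cutoff as the original counter
            break
    print(title_list)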

Related

Extract specific text from a list in Python

I am trying to extract certain information from a long list of text to display it nicely, but I cannot seem to figure out exactly how to tackle this problem.
My text is as follows:
"(Craw...Crawley\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\n**Hotstage**\n **248236**\n\n\n\n\n\n\n\n\n\n\n\n\n\nCosta Collect...Costa Coffee (Bedf...Bedford\n\n\n\n\n\n\n08:00\n\n\n\n \n\n\n**Hotstage**\n **247962**\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Sheffield Qu...Sheffield\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 247971\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Brentford...BRENTFORD\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 248382\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Newport"
I would like to extract the highlighted parts (the text wrapped in ** above).
I'm thinking the solution is simple and maybe I am not storing the information properly or not extracting it properly.
This is my code:
from bs4 import BeautifulSoup
import requests
import re
import time

def main():
    url = "http://antares.platinum-computers.com/schedule.htm"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    response.close()
    # Get
    tech_count = 0
    technicians = []  # list to hold technicians' names
    xcount = 0
    test = 0
    name_links = soup.find_all('td', {"class": "resouce_on"})  # get all table data with class name "resouce_on"
    # iterate through html data and add them to "technicians = []"
    for i in name_links:
        technicians.append(str(i.text.strip()))  # append value to list
        tech_count += 1
    print("Found: " + str(tech_count) + " technicians + 1 default unallocated.")
    for t in technicians:
        print(xcount, t)
        xcount += 1
    test = int(input("choose technician: "))
    for link in name_links:
        if link.find(text=re.compile(technicians[test])):
            jobs = []
            numbers = []
            unique_cr = []
            jobs.append(link.parent.text.strip())
            for item in jobs:
                for subitem in item.split():
                    if subitem.isdigit():
                        numbers.append(subitem)
            for number in numbers:
                if number not in unique_cr:
                    unique_cr.append(number)
            print("tasks for technician " + str(technicians[test]) + " are as follows")
            for cr in unique_cr:
                print(jobs)

if __name__ == '__main__':
    main()
It's fairly simple:
myStr = "your complicated text"
words = myStr.split("\n")
niceWords = []
for word in words:
    if "**" in word:
        niceWords.append(word.replace("**", ""))
print(niceWords)
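Applied to the sample above, that loop collects the **-wrapped lines; a strip() tidies the leading whitespace left on the numbers. A minimal sketch combining both steps (the shortened sample string here is illustrative, not the full text):

text = "(Craw...Crawley\n08:00\n**Hotstage**\n **248236**"  # shortened sample
niceWords = [line.replace("**", "").strip()
             for line in text.split("\n")
             if "**" in line]
print(niceWords)  # ['Hotstage', '248236']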

How to write a file in a good format with Python?

I'm working on a project and trying to write a file in a nice format with Python. I've tried a lot but I don't know what's going wrong.
I tried:
def generate_file(self, lyrics):
    self.path()
    print('We are writing file ............')
    with open('filename.srt', 'w') as lrc:
        for i in range(len(lyrics)):
            add = ''
            if lyrics[i].isnumeric():
                add += '\n'
            elif lyrics[i].isalpha():
                add += '\n\n'
            lrc.write(lyrics[i] + add)
            add += ''
    lrc.close()
    print('We downloaded your file!')
Output:
000:00:00‚000 --> 00:00:00‚000by RentAnAdviser.com100:00:22‚608 --> 00:00:26‚607Drink from me drink fromme oh ah oh ah200:00:26‚803 --> 00:00:30‚602Then we′ll shoot across the symphony300:00:30‚808 --> 00:00:38‚807Then we′ll shoot across the sky400:00:43‚599 --> 00:00:48‚498Oh angels sent from up above500:00:48‚702 --> 00:00:53‚801You know you make my world light up600:00:54‚005 --> 00:00:59‚004When I was down when I was hurt700:00:59‚218 --> 00:01:04‚717You came to lift me up800:01:04‚911 --> 00:01:09‚610Life is a drink and love′s a ****900:01:09‚812 --> 00:01:15‚011Oh now I think I must be miles up1000:01:15‚217 --> 00:01:20‚316When I was hurt withered dried up1100:01:20‚506 --> 00:01:26‚005You came to rain a flood1200:01:26‚217 --> 00:01:28‚716So drink from me drink from me1300:01:28‚900 -
I expected:
0
00:00:00,000 --> 00:00:00,000
by RentAnAdviser.com
1
00:00:17,842 --> 00:00:21,341
Drink from me‚ drink from me
2
00:00:21,537 --> 00:00:23,336
Then we′ll shoot across the sky
3
00:00:23,546 --> 00:00:24,545
Drink from me‚ drink from me
How can I do that?
My project:
from bs4 import BeautifulSoup
import os, requests, platform

class EpicLyricFinderApp:
    def __init__(self):
        self.text = '%20'.join(input('Enter song name and also include singer: ').split(' '))
        self.url = 'https://www.rentanadviser.com/en/subtitles/subtitles4songs.aspx?src=' + self.text
        self.user = None
        self.app()

    def app(self):
        req = requests.get(self.url).content
        soup = BeautifulSoup(req, 'html.parser')
        print('Please wait ...................')
        tag = soup.findAll('table')
        link = [('https://www.rentanadviser.com/en/subtitles/' + l.get('href')) + '&type=srt' for l in [a.find('a') for a in tag]]
        blank_name = [''.join((l.get_text()).split(' ')[17:]) for l in [a.find('a') for a in tag]]
        [print('No. {} ==>> {}'.format(name + 1, blank_name[name])) for name in range(len(blank_name))]
        # Get input from user to choose lyrics
        print('=' * 60)
        while True:
            try:
                self.user = int(input('Which lyrics you wanna download?: '))
            except ValueError:
                continue
            else:
                break
        # Open .srt link
        req1 = requests.get(link[self.user]).content
        soup1 = BeautifulSoup(req1, 'html.parser')
        lyrics = [c.get_text() for c in soup1.findAll('span', attrs={'id': 'ctl00_ContentPlaceHolder1_lblSubtitle'})]
        self.generate_file(lyrics)

    @staticmethod
    def path():
        if platform.system() == 'Linux':
            linux = '/home/rohit/Desktop/lrc'
            if os.path.exists(linux):
                os.chdir(linux)
            else:
                os.mkdir(linux)
                os.chdir(linux)
        else:
            windows = 'Cd:/Users/ABC/rohit/Desktop/lrc'
            if os.path.exists(windows):
                os.chdir(windows)
            else:
                os.mkdir(windows)
                os.chdir(windows)

    def generate_file(self, lyrics):
        self.path()
        print('We are writing file ............')
        with open('_'.join(self.text.split('%20')) + '.srt', 'w') as lrc:
            for i in range(len(lyrics)):
                add = ''
                if lyrics[i].isnumeric():
                    add += '\n'
                elif lyrics[i].isalpha():
                    add += '\n\n'
                lrc.write(lyrics[i] + add)
                add += ''
        print('We downloaded your file!')

if __name__ == '__main__':
    app = EpicLyricFinderApp()
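The root problem is that get_text() with no arguments collapses the span's line breaks, so the numbering, timestamps and lyrics run together. A minimal sketch of one possible fix, assuming the subtitle span marks line breaks with tags such as <br> (an assumption about the page's markup, not verified here); BeautifulSoup's get_text() accepts a separator, so passing '\n' keeps each fragment on its own line:

# sketch: inside app(), extract the span once and write it with a
# separator-aware get_text() instead of per-character checks
span = soup1.find('span', attrs={'id': 'ctl00_ContentPlaceHolder1_lblSubtitle'})
if span is not None:
    with open('lyrics.srt', 'w') as lrc:
        lrc.write(span.get_text('\n'))  # '\n' is inserted between the span's fragments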

Web crawler not able to process more than one webpage

I am trying to extract some information about mtg cards from a webpage with the following program, but I repeatedly retrieve information about the initial page given (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe a restriction on using urllib slipped my attention. Here is the code I have been struggling with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup

InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]

for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)
    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()
    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")
    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more information to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup
# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
return_list = []
# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one, so we use this knowledge to loop until the next button goes away. When it does, meaning the last page has been reached, the button is replaced with an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping:
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached
while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)
    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will scroll until no more pages exist, no matter how many subpages the site has.
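For the final step the closing comment hints at, a minimal sketch, assuming pandas is installed (it is not used in the original answer):

import pandas as pd

# turn the list of dicts into a DataFrame and write it out as a spreadsheet-friendly CSV
df = pd.DataFrame(return_list)
df.to_csv("mtg_cards.csv", index=False)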
My point in the comment above was merely that if you encounter an exception in your code, your page count would never increase. That's probably not what you want, which is why I recommended you learn a little more about the behaviour of the whole try-except-else-finally deal.
I am also baffled by the request returning the same reply, ignoring the page parameter. As a dirty solution I can offer you to first set the page size to a number high enough to get all the items you want (this parameter works, for some reason...):
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup

InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2  # depth of pages to be retrieved
query = "dragon"
cardSet = set()

for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)
    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + " (cards)\n")

When going through a for loop, if the variable is true it prints twice; when it is false it only prints once

I was making a checker for opinions on a website. When going through it, if they both matched, the text would print twice, and when they didn't match it would only print once. I've been trying to figure out how to print only the true matches, and to print them only once.
The CMD output looks like this:
http://prntscr.com/h3ioli
import cfscrape, re, os, time
from bs4 import BeautifulSoup

cc = open('cookie.txt').read()
mybbuser, sid = cc.split(':')
MainScrapper = cfscrape.create_scraper()

def substring_after(string, delim, back):
    return string.partition(delim)[back]

suspect = raw_input('User ID: ')

def reputationCheck(userid):
    reputationlist = []
    r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}&show=positive'.format(userid), cookies={'mybbuser': mybbuser, 'sid': sid})
    soup = BeautifulSoup(r.text, 'html.parser')
    reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
    for reputation in reputations:
        reputationlist = reputationlist + [substring_after(reputation['href'], 'uid=', 2)]
    if soup.find('span', {'class': 'pages'}):
        pages = soup.find('span', {'class': 'pages'}).text
        pages = substring_after(pages, '(', 2)
        pages = substring_after(pages, '):', 0)
        soup = BeautifulSoup(r.text, 'html.parser')
        for x in range(1, (int(pages))):
            r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}'.format(userid) + '&show=positive&page={}'.format(x + 1), cookies={'mybbuser': mybbuser, 'sid': sid})
            soup = BeautifulSoup(r.text, 'html.parser')
            reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
            for reputation in reputations:
                if not reputation == suspect:
                    reputationlist = reputationlist + [substring_after(reputation['href'], 'uid=', 2)]
    for userids in reputationlist:
        if not str(userids) == str(suspect):
            victim = []
            r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}'.format(userids) + '&show=positive', cookies={'mybbuser': mybbuser, 'sid': sid})
            soup = BeautifulSoup(r.text, 'html.parser')
            reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
            for reputation in reputations:
                if substring_after(reputation['href'], 'uid=', 2) == str(suspect):
                    print(str(userids) + ' exchanged reputation with ' + str(suspect))
                else:
                    pass
if not reputation == suspect:
if not str(userids) == str(suspect):
These should be:
if reputation != suspect:
if str(userids) != str(suspect):
Maybe you should put your print call outside the loop.
Something like:
a = ['x', 'y', 'z']
c = ''
for b in a:
    c += b
    print('this is inside loop, create multiple print: ' + c)
print('this is outside loop, create single print, get it: ' + c)

All Links not retrieved from webpage -- python

I would like the user to give search type choices and search text, and then display all the links in the resulting webpage, but I am not able to retrieve the resulting links (only the home link is retrieved) from the webpage (http://djraag.net/bollywood/searchbycat.php).
from bs4 import BeautifulSoup
import urllib.request
import re

print(" 1 : Album \n 2 : Track \n 3 : Artist \n ")
count = 0
while (count == 0):
    temp_str = input('Enter your search type : ')
    temp = int(temp_str)
    if (temp >= 1) & (temp <= 3):
        search_no = temp
        count = 1
    else:
        print("Invalid Input")
if search_no == 1:
    search_type1 = "album"
    search_type = str(search_type1)
elif search_no == 2:
    search_type1 = "track"
    search_type = str(search_type1)
else:
    search_type1 = "artist"
    search_type = str(search_type1)
Search = input("Search : ")
url_temp = "http://djraag.net/bollywood/searchbycat.php?search=" + Search + "&type=" + search_type + "&cat_id=5&submit=Submit"
url = urllib.request.urlopen(url_temp)
content = url.read()
soup = BeautifulSoup(content, "html.parser")
for a in soup.findAll('a', href=True):
    if re.findall('http', a['href']):
        print("URL:", a['href'])
Remove the line
if re.findall('http', a['href']):
from the code and try again.
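For reference, a minimal sketch of the final loop without that filter; urljoin (not in the original code) is one way to turn the page's relative hrefs into absolute URLs, which is likely why the 'http' check matched only the home link:

from urllib.parse import urljoin

for a in soup.findAll('a', href=True):
    # resolve relative links against the search URL before printing
    print("URL:", urljoin(url_temp, a['href']))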
