I am trying to extract certain information from a long list of text to display it nicely, but I cannot seem to figure out how exactly to tackle this problem.
My text is as follows:
"(Craw...Crawley\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\n**Hotstage**\n **248236**\n\n\n\n\n\n\n\n\n\n\n\n\n\nCosta Collect...Costa Coffee (Bedf...Bedford\n\n\n\n\n\n\n08:00\n\n\n\n \n\n\n**Hotstage**\n **247962**\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Sheffield Qu...Sheffield\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 247971\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Brentford...BRENTFORD\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 248382\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Newport"
I would like to extract what is highlighted (the parts wrapped in **).
I'm thinking the solution is simple; maybe I am not storing the information properly or not extracting it properly.
This is my code:
from bs4 import BeautifulSoup
import requests
import re
import time

def main():
    url = "http://antares.platinum-computers.com/schedule.htm"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    response.close()

    tech_count = 0
    technicians = []  # list to hold technician names
    xcount = 0
    test = 0
    name_links = soup.find_all('td', {"class": "resouce_on"})  # get all table cells with class "resouce_on"
    # iterate through the html data and add the names to technicians
    for i in name_links:
        technicians.append(str(i.text.strip()))  # append name to the list
        tech_count += 1
    print("Found: " + str(tech_count) + " technicians + 1 default unallocated.")
    for t in technicians:
        print(xcount, t)
        xcount += 1
    test = int(input("choose technician: "))
    for link in name_links:
        if link.find(text=re.compile(technicians[test])):
            jobs = []
            numbers = []
            unique_cr = []
            jobs.append(link.parent.text.strip())
            for item in jobs:
                for subitem in item.split():
                    if subitem.isdigit():
                        numbers.append(subitem)
            for number in numbers:
                if number not in unique_cr:
                    unique_cr.append(number)
            print("tasks for technician " + str(technicians[test]) + " are as follows")
            for cr in unique_cr:
                print(jobs)

if __name__ == '__main__':
    main()
It's fairly simple:
myStr = "your complicated text"
words = mystr.split("\n")
niceWords = []
for word in words:
If "**"in word:
niceWords.append(word.replace("**", "")
print(niceWords)
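If you also need to keep each Hotstage label paired with the job number that follows it, a small regex sketch could do it; this assumes (based on the sample text) that a starred name is always immediately followed by a starred number:

import re

myStr = "your complicated text"
# Each highlighted pair appears as **Name** followed by **number**
# (assumption based on the sample above).
pairs = re.findall(r"\*\*(.+?)\*\*\s*\*\*(\d+)\*\*", myStr)
for name, number in pairs:
    print(name, number)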
from bs4 import BeautifulSoup
import requests

first = ()
first_slice = ()
last = ()

def askname():
    global first
    first = input(str("First Name of Player?"))
    global last
    last = input(str("Last Name of Player?"))
    print("Confirmed, loading up " + first + " " + last)

# asks user for player name
askname()

first_slice_result = (first[:2])
last_slice_result = (last[:5])
print(first_slice_result)
print(last_slice_result)
# slices player's name so it can match the format bref uses

first_slice_resultA = str(first_slice_result)
last_slice_resultA = str(last_slice_result)
first_last_slice = last_slice_resultA + first_slice_resultA
lower = first_last_slice.lower() + "01"
start_letter = (last[:1])
lower_letter = (start_letter.lower())
# grabs the letter bref uses for organization
print(lower)

source = requests.get('https://www.basketball-reference.com/players/' + lower_letter + '/' + lower + '.html').text
soup = BeautifulSoup(source, 'lxml')
tbody = soup.find('tbody')
pergame = tbody.find(class_="full_table")
classrite = tbody.find(class_="right")
tr_body = tbody.find_all('tr')
# print(pergame)

for td in tbody:
    print(td.get_text())
print("done")

get = str(input("What stat? \nCheck commands.txt for statistic names. \n"))

for trb in tr_body:
    print(trb.get('id'))
    print("\n")
    th = trb.find('th')
    print(th.get_text())
    print(th.get('data-stat'))
    row = {}
    for td in trb.find_all('td'):
        row[td.get('data-stat')] = td.get_text()
    print(row[get])
So I have this program that scrapes elements based on a given "data-stat" value (pg_per_mp etc.).
However, right now I can only get that data-stat value either by assigning it to a variable or by getting it from an input. I would like to make a list of data-stats and grab the value for each data-stat in the list.
for example:

stats = ["fga_per_mp", "fg3_per_mp", "ft_per_mp"]
for x in stats:
    print(x)
In a perfect world, the script would take each value of the list and scrape the website for the assigned stat.
I tried editing line 66 - 79 to:
get = [fga_per_mp, fg3_per_mp]

for trb in tr_body:
    print(trb.get('id'))
    print("\n")
    th = trb.find('th')
    print(th.get_text())
    print(th.get('data-stat'))
    row = {}
    for td in trb.find_all('td'):
        for x in get():
            row[td.get('data-stat')] = td.get_text()
…but of course that wouldn't work. Any help?
I would avoid hard-coding the player id as it may not always follow that same pattern. What I would do is pull in the player names and IDs (since the site provides them), then use something like fuzzywuzzy to match the player name input (in case of typos and whatnot).
Once you have that, it's just a matter of pulling out the specific <td> tag with the chosen data-stat.
from bs4 import BeautifulSoup
import requests
import pandas as pd
# pip install fuzzywuzzy
from fuzzywuzzy import process
# pip install choice
import choice

def askname():
    playerNameInput = input(str("Enter the player's name -> "))
    return playerNameInput

# Get all player IDs
player_df = pd.read_csv('https://www.basketball-reference.com/short/inc/sup_players_search_list.csv', header=None)
player_df = player_df.rename(columns={0: 'id',
                                      1: 'playerName',
                                      2: 'years'})
playersList = list(player_df['playerName'])

# asks user for player name
playerNameInput = askname()

# Find closest matches
search_match = pd.DataFrame(process.extract(f'{playerNameInput}', playersList))
search_match = search_match.rename(columns={0: 'playerName', 1: 'matchScore'})
matches = pd.merge(search_match, player_df, how='inner', on='playerName').drop_duplicates().reset_index(drop=True)
choices = [': '.join(x) for x in list(zip(matches['playerName'], matches['years']))]

# Choose the match
playerChoice = choice.Menu(choices).ask()
playerName, years = playerChoice.split(': ')

# Get the matched player's id
match = player_df[(player_df['playerName'] == playerName) & (player_df['years'] == years)]

baseUrl = 'https://www.basketball-reference.com/players'
playerId = match.iloc[0]['id']
url = f'{baseUrl}/{playerId[0]}/{playerId}.html'

html = requests.get(url).text.replace('<!--', '').replace('-->', '')
soup = BeautifulSoup(html, 'html.parser')

statList = ['fga_per_mp', 'fg3_per_mp', 'ft_per_mp', 'random']

for stat in statList:
    try:
        statTd = soup.find('td', {'data-stat': stat})
        print(statTd['data-stat'], statTd.text)
    except:
        print(f'{stat} stat not found')
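Note that soup.find only returns the first matching <td>, i.e. the first season row. If you wanted the chosen stats across every row, a minimal sketch reusing the same soup and statList from above might be:

# Collect each chosen stat from every cell that carries it,
# not just the first match on the page.
for stat in statList:
    cells = soup.find_all('td', {'data-stat': stat})
    if not cells:
        print(f'{stat} stat not found')
        continue
    print(stat, [cell.text for cell in cells])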
I've created a script to fetch all the conversation between different debaters excluding moderators. What I've written so far can fetch the total conversation. However, I would like to grab them like {speaker_name: (first speech, second speech) etc }.
Webpage link
Another one similar to the above: webpage link
I've tried so far:
import requests
from bs4 import BeautifulSoup

url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'

def get_links(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".field-docs-content p:has( > strong:contains('MODERATOR:')) ~ p"):
        print(item.text)

if __name__ == '__main__':
    get_links(url)
How can I scrape the conversation among debaters and put them in a dictionary?
I don't hold much hope for this lasting across lots of pages given the variability amongst the two pages I saw and the number of assumptions I have had to make. Essentially, I use regex on participant and moderators nodes text to isolate the lists of moderators and participants. I then loop all speech paragraphs and each time I encounter a moderator at the start of a paragraph I set a boolean variable store_paragraph = False and ignore subsequent paragraphs; likewise, each time I encounter a participant, I set store_paragraph = True and store that paragraph and subsequent ones under the appropriate participant key in my speaker_dict. I store each speaker_dict in a final results dictionary.
import requests, re
from bs4 import BeautifulSoup as bs
import pprint

links = ['https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas',
         'https://www.presidency.ucsb.edu/documents/republican-presidential-candidates-debate-manchester-new-hampshire-0']
results = {}
p = re.compile(r'\b(\w+)\b\s+\(|\b(\w+)\b,')

with requests.Session() as s:
    for number, link in enumerate(links):
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        participants_tag = soup.select_one('p:has(strong:contains("PARTICIPANTS:"))')
        if participants_tag.select_one('strong'):
            participants_tag.strong.decompose()
        speaker_dict = {i[0].upper() + ':' if i[0] else i[1].upper() + ':': [] for string in participants_tag.stripped_strings for i in p.findall(string)}
        # print(speaker_dict)
        moderator_data = [string for string in soup.select_one('p:has(strong:contains("MODERATOR:","MODERATORS:"))').stripped_strings][1:]
        # print(moderator_data)
        moderators = [i[0].upper() + ':' if i[0] else i[1].upper() + ':' for string in moderator_data for i in p.findall(string)]
        store_paragraph = False
        for paragraph in soup.select('.field-docs-content p:not(p:contains("PARTICIPANTS:","MODERATOR:"))')[1:]:
            string_to_compare = paragraph.text.split(':')[0] + ':'
            string_to_compare = string_to_compare.upper()
            if string_to_compare in moderators:
                store_paragraph = False
            elif string_to_compare in speaker_dict:
                speaker = string_to_compare
                store_paragraph = True
            if store_paragraph:
                speaker_dict[speaker].append(paragraph.text)
        results[number] = speaker_dict

pprint.pprint(results[1])
So I am trying to code a web crawler that goes into each chapter of a title for a statute and counts occurrences of a set of key words ("shall", "must") in its content.
Below is the code I used to acquire links to each chapter.
The base URL I used is http://law.justia.com/codes/georgia/2015/
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
from collections import Counter

pattern1 = re.compile(r"\bshall\b", re.IGNORECASE)
pattern2 = re.compile(r"\bmust\b", re.IGNORECASE)

######################################## Sections ##########################
def levelthree(item2_url):
    r = requests.get(item2_url)
    for sectionlinks in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        if sectionlinks.has_attr('href'):
            if 'section' in sectionlinks['href']:
                href = "http://law.justia.com" + sectionlinks.get('href')
                href = "\n" + str(href)
                print(href)

######################################## Chapters ##########################
def leveltwo(item_url):
    r = requests.get(item_url)
    for sublinks in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        if sublinks.has_attr('href'):
            if 'chapt' in sublinks['href']:
                chapterlinks = "http://law.justia.com" + sublinks.get('href')
                # chapterlinks = "\n" + str(chapterlinks)
                # print(chapterlinks)

######################################## Titles #############################
def levelone(url):
    r = requests.get(url)
    for links in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        if links.has_attr('href'):
            if 'title-43' in links['href']:
                titlelinks = "http://law.justia.com" + links.get('href')
                # titlelinks = "\n" + str(titlelinks)
                leveltwo(titlelinks)
                # print(titlelinks)

###########################################################################
base_url = "http://law.justia.com/codes/georgia/2015/"
levelone(base_url)
The problem is that the structure of the pages is usually title - chapter - sections - contents (ex: http://law.justia.com/codes/georgia/2015/title-43/chapter-1/section-43-1-1/),
but there are ones that are title - chapter - articles - sections - contents (ex: http://law.justia.com/codes/georgia/2015/title-43/chapter-4/article-1/section-43-4-1/).
I am able to get the links for the first scenario, but I will miss all of the title - chapter - article - sections - contents ones.
My question is: how can I code this so that I will be able to get the contents of each chapter (from section links and from article-to-section links), then look for the occurrence of words (such as "shall" or "must") for each chapter individually?
I want to find the word frequency by chapters, hopefully the output will be something like this
chapter 1
Word Frequency
shall 35
must 3
chapter 2
Word Frequency
shall 59
must 14
For this problem, count the '/' characters in the URLs:
http://law.justia.com/codes/georgia/2015/title-43/chapter-1/section-43-1-1/
http://law.justia.com/codes/georgia/2015/title-43/chapter-4/article-1/section-43-4-1/

if url.count('/') == 9:
    # do something (chapter - section layout)
if url.count('/') == 10:
    # do something (chapter - article - section layout)
or you can do a simple trick:

part = url.rstrip('/').split('/')
title = part[6]
chapter = part[7]
section = part[-1]

Note: -1 means the last part; rstrip('/') drops the trailing slash so the last element is the section.
To count shall or must, use the count function:

shall_count = response_text.count('shall')
must_count = response_text.count('must')
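Be aware that str.count matches substrings, so 'must' would also match 'mustard'. If you want whole-word counts per chapter, here is a rough sketch that reuses the compiled patterns from the question; section_urls is a hypothetical list holding the section URLs your crawler has already gathered for one chapter:

import re
import requests
from bs4 import BeautifulSoup

pattern1 = re.compile(r"\bshall\b", re.IGNORECASE)
pattern2 = re.compile(r"\bmust\b", re.IGNORECASE)

def count_words(section_url):
    # Count whole-word occurrences in one section's visible text.
    r = requests.get(section_url)
    text = BeautifulSoup(r.content, "html.parser").get_text()
    return len(pattern1.findall(text)), len(pattern2.findall(text))

# section_urls is assumed to hold one chapter's section links.
shall_total = must_total = 0
for section_url in section_urls:
    shall, must = count_words(section_url)
    shall_total += shall
    must_total += must
print("shall", shall_total)
print("must", must_total)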
I would like the user to give a search type choice and search text, and then display all the links in the resulting webpage, but I am not able to retrieve the resulting links (only the home link is retrieved) from the webpage (http://djraag.net/bollywood/searchbycat.php).
from bs4 import BeautifulSoup
import urllib.request
import re

print(" 1 : Album \n 2 : Track \n 3 : Artist \n ")

count = 0
while count == 0:
    temp_str = input('Enter your search type : ')
    temp = int(temp_str)
    if (temp >= 1) & (temp <= 3):
        search_no = temp
        count = 1
    else:
        print("Invalid Input")

if search_no == 1:
    search_type1 = "album"
    search_type = str(search_type1)
elif search_no == 2:
    search_type1 = "track"
    search_type = str(search_type1)
else:
    search_type1 = "artist"
    search_type = str(search_type1)

Search = input("Search : ")

url_temp = "http://djraag.net/bollywood/searchbycat.php?search=" + Search + "&type=" + search_type + "&cat_id=5&submit=Submit"
url = urllib.request.urlopen(url_temp)
content = url.read()
soup = BeautifulSoup(content, "html.parser")

for a in soup.findAll('a', href=True):
    if re.findall('http', a['href']):
        print("URL:", a['href'])
Remove the line

if re.findall('http', a['href']):

from the code and try again.
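The reason only the home link shows up is most likely that the result links are relative (they don't start with http), so that filter discards them. If you still want absolute URLs, a sketch using urljoin with the soup and url_temp variables from the question would be:

from urllib.parse import urljoin

for a in soup.findAll('a', href=True):
    # Resolve relative hrefs against the page URL instead of discarding them.
    print("URL:", urljoin(url_temp, a['href']))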
I am new to coding in Python (maybe a couple of days in) and basically learning from other people's code on Stack Overflow. The code I am trying to write uses BeautifulSoup to get the pid and the corresponding price for motorcycles on craigslist. I know there are many other ways of doing this, but my current code looks like this:
from bs4 import BeautifulSoup
from urllib2 import urlopen

u = ""
count = 0
while (count < 9):
    site = "http://sfbay.craigslist.org/mca/" + str(u)
    html = urlopen(site)
    soup = BeautifulSoup(html)
    postings = soup('p', {"class": "row"})
    f = open("pid.txt", "a")
    for post in postings:
        x = post.getText()
        y = post['data-pid']
        prices = post.findAll("span", {"class": "itempp"})
        if prices == "":
            w = 0
        else:
            z = str(prices)
            z = z[:-8]
            w = z[24:]
        filewrite = str(count) + " " + str(y) + " " + str(w) + '\n'
        print y
        print w
        f.write(filewrite)
    count = count + 1
    index = 100 * count
    print "index is" + str(index)
    u = "index" + str(index) + ".html"
It works fine, and as I keep learning I plan to optimize it. The problem I have right now is that entries without a price are still showing up. Is there something obvious that I am missing?
Thanks.
The problem is how you're comparing prices. You say:
prices = post.findAll("span", {"class":"itempp"})
In BS, .findAll returns a list of elements. When you compare that list to an empty string, it will always return False:

>>> [] == ""
False

Change if prices == "": to if prices == []: and everything should be fine.
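If you then want the price text itself rather than the stringified list, a short sketch (using the same post variable as in the question, with the slightly more idiomatic truthiness check in place of the == [] comparison) could be:

prices = post.findAll("span", {"class": "itempp"})
if not prices:  # empty list: this posting has no price
    w = 0
else:
    w = prices[0].getText()  # text of the first matched span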
I hope this helps.