Loop while dynamically scraping - Python

I want to make real-time scrapes that run at different intervals depending on the data. For example, if the T value of the last data I scraped is 1, the loop should run once every 6 hours; if T = 2, once every hour; and if T = 3, once every minute.
After thinking through the logic, I was confused about how to implement it.
T is the value I use as the reference. Here is an example of the data:
https://i.stack.imgur.com/H427J.png
Here is the code snippet I have so far.
headers = ["Year", "Month", "Day", "Hour", "Minute", "Second", "T", "Height"]
page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
soup = BeautifulSoup(page.text, 'html.parser')
datas = []
dt = soup.find_all('textarea')[0].text
datas = dt.split('\n')[2:-1]
#membaca scrape to to array dan membaca data ke6
arr = []
arr = np.array([datas])
def listToString(s):
str1 = ""
for ele in s:
str1 += ele
return str1
coba = []
for item_list in arr:
item_string = listToString(item_list)
coba.append(item_string.split()[6])
print(coba)
# -----------------------------------------------
# loop over the scrape at an interval based on T
while True:
    if coba[0] == 1:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
    if coba[0] == 2:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
    if coba[0] == 3:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
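One way to avoid repeating the scraping block three times is to map each T value to a sleep interval and re-read T on every pass. Below is a minimal sketch of that idea, assuming the request/CSV/addDate()/insertSQL() steps above are wrapped in a helper (scrape_station() here is a hypothetical name, not part of the original code). Note that coba[0] is a string, so it has to be converted with int() before it can be compared to 1, 2, or 3.

import time

# T value -> sleep interval in seconds (6 hours, 1 hour, 1 minute)
INTERVALS = {1: 6 * 3600, 2: 3600, 3: 60}

def scrape_station():
    # placeholder for the request + CSV + addDate() + insertSQL() steps above;
    # it should return the latest T value from the scraped data as an int
    return int(coba[0])  # coba[0] is a string such as "1", so convert it

while True:
    t_value = scrape_station()
    time.sleep(INTERVALS.get(t_value, 3600))  # fall back to 1 hour for an unknown T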

Related

Write to dictionary of lists to a CSV file

I want each key's values to stay associated with it while displaying vertically. When I write the dictionary values with w.writerows(d.values()), they go horizontally, but I want them vertically (one column per key).
import csv

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.ufc.com/rankings')
s = BeautifulSoup(r.text, 'lxml')
fighters = s.find_all('div', 'view-grouping')
name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all('a')]
    name.append(z)
    divisions = x.find('h4')
    dd = divisions.text
    weightdivisions.append(dd)
d = dict(zip(weightdivisions, name))
print(d)
with open('ufc.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    w.writerows(d.values())
Try:
import csv

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.ufc.com/rankings")
s = BeautifulSoup(r.text, "lxml")
fighters = s.find_all("div", "view-grouping")
name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all("a")]
    name.append(z)
    divisions = x.find("h4")
    dd = divisions.text
    weightdivisions.append(dd)
d = dict(zip(weightdivisions, name))

with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())

    # turn each column's list into an iterator, then emit one row at a time,
    # padding exhausted columns with "" until every column is empty
    for column in d:
        d[column] = iter(d[column])
    while True:
        row = [next(d[column], "") for column in d]
        if all(val == "" for val in row):
            break
        w.writerow(row)
This saves ufc.csv correctly, with each weight division as a column header and the fighters listed vertically beneath it.
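The same transposition can also be done with itertools.zip_longest, which pads the shorter lists for you. A minimal sketch of that alternative, assuming d is the dict built above:

import csv
from itertools import zip_longest

with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # zip_longest transposes the columns into rows, padding short columns with ""
    w.writerows(zip_longest(*d.values(), fillvalue=""))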

Multi threading not processing full list

I am using multi-threading to visit links read from a spreadsheet. Strangely, irrespective of max_workers, and even when I remove the multi-threading part, the code processes an arbitrarily smaller number of URLs than are in the list. I print the list length to verify the count. For example, if the list has 5,000 URLs the code stops at 4,084; if there are 13,000 links it stops around 9,200; even with just 130 links it stops at about 80. What am I doing wrong here?
import concurrent.futures
import csv

import requests
import xlrd
from bs4 import BeautifulSoup

header_added = False

file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
    all_links.append(sheet.cell_value(row, 0))
print(len(all_links))

i = 0

def get_solution(url):
    global header_added, i
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    ques_div = soup.find('p', class_='header-description')
    ques = ques_div.find('span').text
    ans_divs = soup.findAll('div', class_='puzzle-solution')
    ans = ans_divs[0].text
    print("Solution ", i)
    i += 1
    dict1 = {"Words": ques, "Solution": ans}
    with open('Results10k.csv', 'a+', encoding='utf-8') as f:
        w = csv.DictWriter(f, dict1.keys())
        if not header_added:
            w.writeheader()
            header_added = True
        w.writerow(dict1)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    result = executor.map(get_solution, all_links)
Here's a reworking of your code that doesn't need locks – instead, there's only ever one process that writes to the file.
Also, due to the GIL, using a ThreadPool will be slower than a process-backed Pool.
import csv
import multiprocessing

import requests
import xlrd
from bs4 import BeautifulSoup

sess = requests.Session()

def get_solution(url):
    try:
        resp = sess.get(url)
        resp.raise_for_status()
        page = resp.text
        soup = BeautifulSoup(page, "html.parser")
        ques_div = soup.find("p", class_="header-description")
        ques = ques_div.find("span").text.strip()
        ans_divs = soup.findAll("div", class_="puzzle-solution")
        ans = ans_divs[0].text.strip()
        return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
    except Exception as exc:
        print(url, "Error:", exc)
        return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}

def read_links(file_location):
    workbook = xlrd.open_workbook(file_location)
    sheet = workbook.sheet_by_index(0)
    all_links = []
    for row in range(1, 11000):
        all_links.append(sheet.cell_value(row, 0))
    return all_links

def main():
    links = read_links("./Urls.xlsx")
    with open("Results10k.csv", "w", encoding="utf-8") as f:
        with multiprocessing.Pool() as p:  # (or multiprocessing.pool.ThreadPool)
            for i, result in enumerate(p.imap_unordered(get_solution, links, chunksize=16)):
                if i == 0:
                    writer = csv.DictWriter(f, result.keys())
                    writer.writeheader()
                writer.writerow(result)
                f.flush()  # ensure changes are written immediately
                if i % 100 == 0:  # progress indicator
                    print(i)

if __name__ == "__main__":
    main()
It could be that get_solution() crashes for some of the URLs. You could add a try/except in the body of the function and write all crashed URLs to a different file.
def get_solution(url):
    try:
        ...
    except:
        with open('errors.txt', 'a+') as f:
            f.write(url + '\n')
If this is the problem, the two counts should add up to the total number of URLs.
Also, appending to the same file from multiple threads with open() is probably not thread safe.
import threading

file_lock = threading.Lock()

def get_solution(url):
    with file_lock:
        with open('Results10k.csv', 'a+', encoding='utf-8') as f:
            w = csv.DictWriter(f, dict1.keys())
            ...
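Putting those two suggestions together, here is a minimal self-contained sketch of the lock-protected append. The fetch/parse part is reduced to a hypothetical placeholder; only the file access is serialized:

import csv
import threading

file_lock = threading.Lock()
header_added = False

def write_result(dict1):
    """Append one result row, writing the header on the first call."""
    global header_added
    with file_lock:  # only one thread touches the file at a time
        with open('Results10k.csv', 'a+', newline='', encoding='utf-8') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)

def get_solution(url):
    # ... fetch and parse the page as in the question ...
    dict1 = {"Words": "placeholder", "Solution": "placeholder"}  # stands in for the parsed values
    write_result(dict1)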

Try, except, and finally statements

I am developing a Nitro Type bot and I don't see a reason I should be getting an error, but around 20 races in it goes directly to sending me an email. Am I using the statements wrong? I am new to Python, so if it is a really simple mistake, please be nice; if it is something easily found on the web, I'm sorry.
try:
    time.sleep(4)
    driver.get('https://www.nitrotype.com/garage')
    driver.implicitly_wait(20)
    driver.find_element_by_css_selector('a.btn--light:nth-child(2)').click()
    time.sleep(5)
    driver.find_element_by_css_selector('button.btn--primary').click()
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector('.dash-copyContainer')
    time.sleep(4.25)
    html = driver.page_source.replace(' ', ' ')
    f = open("word.html", "w")
    f.write(html)
    f.close()
    with open("word.html", "r") as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        words = soup.find_all('span', class_='dash-letter')
        stuff = ""
        for span in words:
            if span.text.isascii():
                stuff += span.text
    with open("Sentence.txt", "w") as wf:
        wf.write(stuff)
        wf.close()
    e = open('Sentence.txt', 'r')
    s = e.read()
    Words = (s.split())
    Delay = ((len(s.split()) / WPM) * 60)
    int(Delay)
    Delay1 = Delay / len(s.split())
    for Word in Words:
        pyautogui.typewrite(Word + " ")
        time.sleep(Delay1)
    time.sleep(2)
    driver.get('https://www.nitrotype.com/garage')
except:
    time.sleep(4)
    driver.get('https://www.nitrotype.com/garage')
    driver.implicitly_wait(20)
    driver.find_element_by_css_selector('a.btn--light:nth-child(2)').click()
    time.sleep(5)
    driver.find_element_by_css_selector('button.btn--primary').click()
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector('.dash-copyContainer')
    time.sleep(4.25)
    html = driver.page_source.replace(' ', ' ')
    f = open("word.html", "w")
    f.write(html)
    f.close()
    with open("word.html", "r") as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        words = soup.find_all('span', class_='dash-letter')
        stuff = ""
        for span in words:
            if span.text.isascii():
                stuff += span.text
    with open("Sentence.txt", "w") as wf:
        wf.write(stuff)
        wf.close()
    e = open('Sentence.txt', 'r')
    s = e.read()
    Words = (s.split())
    Delay = ((len(s.split()) / WPM) * 60)
    int(Delay)
    Delay1 = Delay / len(s.split())
    for Word in Words:
        pyautogui.typewrite(Word + " ")
        time.sleep(Delay1)
    time.sleep(2)
    driver.get('https://www.nitrotype.com/garage')
finally:
    driver1 = webdriver.Chrome(executable_path='/Users/Braeden/Downloads/chromedriver.exe')
    driver1.get('https://accounts.google.com/ServiceLogin/signinchooser?service=mail&passive=true&rm=false&continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&ss=1&scc=1&ltmpl=default&ltmplcache=2&emr=1&osid=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin')
    time.sleep(2)
    driver1.find_element_by_css_selector('#identifierId').send_keys(EU)
    time.sleep(2)
    driver1.find_element_by_css_selector('.VfPpkd-vQzf8d').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('<div class="VfPpkd-RLmnJb"></div>').send_keys(EP)
    time.sleep(1)
    driver1.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > span:nth-child(2)').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > div:nth-child(1)').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('.T-I-KE').click()
    time.sleep(2)
    driver1.find_element_by_css_selector('#\:c1').send_keys(TO)
    driver1.find_element_by_css_selector('#\:co').send_keys('Nitro type requires Captcha')
    driver1.find_element_by_css_selector('#\:b9').click()
    driver1.close()
    input('Did you complete the captcha:')
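A likely reason the run ends up in the email flow without any visible cause is that the bare except: swallows whatever exception the try block raised (a missing element, a stale driver, and so on), so the real error is never shown. A minimal sketch of how to surface it, assuming the rest of the flow stays the same:

import traceback

try:
    # ... race loop from the question ...
    pass
except Exception as exc:
    # print the real cause instead of silently retrying
    print("Race loop failed:", exc)
    traceback.print_exc()
    # ... retry / recovery logic ...
finally:
    # ... notification email steps ...
    pass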

Web Scraping through multiple urls

I have code that pulls the content I need, but I would like it to run through all of the gameIds played so far instead of just the one in the URL. I would like to start at 2017020001 and go through 2017021272, which is roughly the end of the season (around 1272 games, I believe). How can that be done with the code below?
import csv
import os

import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()
my_data = []
pk = data['gameData']['game']['pk']
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    time = item['about']['periodTime']
    triCode = item.get('team', {}).get('triCode')
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
with open("NHL_2017020001.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
f.close()
If the game ids are numbered sequentially, then it is as simple as nesting all your code under a for loop that iterates through the game ids and using str.format() to add the necessary zero padding to the number. In this case a few parts change:
import csv
import os

import requests

for i in range(1, 1273):
    url = 'https://statsapi.web.nhl.com/api/v1/game/201702{:04d}/feed/live?site=en_nhl'.format(i)
    req = requests.get(url)
    req.raise_for_status()
    data = req.json()
    my_data = []
    pk = data['gameData']['game']['pk']
    for item in data['liveData']['plays']['allPlays']:
        players = item.get('players')
        if players:
            player_a = players[0]['player']['fullName'] if len(players) > 0 else None
            player_b = players[1]['player']['fullName'] if len(players) > 1 else None
        else:
            player_a, player_b = None, None
        event = item['result']['event']
        time = item['about']['periodTime']
        triCode = item.get('team', {}).get('triCode')
        coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
        my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

    headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
    with open("NHL_201702{:04d}.csv".format(i), "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(my_data)
One last correction: using with ... as means you don't need to close the file explicitly.
You can find additional information on str.format() in the Python documentation.
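For reference, a quick illustration of the zero padding str.format() applies in the URL above:

print('{:04d}'.format(7))            # -> 0007
print('201702{:04d}'.format(1272))   # -> 2017021272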
You should iterate over your code using a for-loop
Something like this should work:
import csv
import os

import requests

for x in range(2017020001, 2017021273):
    req = requests.get('https://statsapi.web.nhl.com/api/v1/game/%s/feed/live?site=en_nhl' % x)
    data = req.json()
    my_data = []
    pk = data['gameData']['game']['pk']
    for item in data['liveData']['plays']['allPlays']:
        players = item.get('players')
        if players:
            player_a = players[0]['player']['fullName'] if len(players) > 0 else None
            player_b = players[1]['player']['fullName'] if len(players) > 1 else None
        else:
            player_a, player_b = None, None
        event = item['result']['event']
        time = item['about']['periodTime']
        triCode = item.get('team', {}).get('triCode')
        coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
        my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

    headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
    # write one CSV per game so the header is not repeated in a single file
    with open("NHL_%s.csv" % x, "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(my_data)
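One caveat with either loop (a hedged refinement, not from the original answers): a few game ids in the range may not exist, and the stats API then returns an error status, so it may be worth skipping those instead of crashing. A small sketch, with fetch_game() as a hypothetical helper:

import requests

def fetch_game(game_id):
    """Return the live feed JSON for a game id, or None if the id does not exist."""
    url = 'https://statsapi.web.nhl.com/api/v1/game/%s/feed/live?site=en_nhl' % game_id
    req = requests.get(url)
    if req.status_code != 200:  # e.g. a 404 for an id that was never played
        return None
    return req.json()

for x in range(2017020001, 2017021273):
    data = fetch_game(x)
    if data is None:
        continue  # skip missing games instead of crashing
    # ... build my_data and write the CSV as in the answers above ...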

scraping a table then writing to csv

I am pretty new to Python and Beautiful Soup; this is my first 'real' project. I am trying to scrape some info from a website, and so far I have been semi-successful: I have identified the table and got Python to print the relevant information pretty nicely.
I am stuck on writing the information Python prints to a usable CSV file.
Here is the code I have to get Python to print the info I need.
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    try:
        a = str(tds[0].get_text())
        b = str(tds[1].get_text())
        c = str(tds[2].get_text())
        d = str(tds[3].get_text())
        e = str(tds[4].get_text())
        f = str(tds[5].get_text())
        g = str(tds[7].get_text())
        print 'User Name:' + a
        print 'Source:' + b
        print 'Staff:' + c
        print 'Location:' + d
        print 'Attended On:' + e
        print 'Used:' + f
        print 'Date:' + g + '\n'
    except:
        print 'bad string'
        continue
Here is a more succinct way to collect your data:
columns = ["User Name", "Source", "Staff", "Location", "Attended On", "Used", "Date"]
table = []
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    try:
        data = [td.get_text() for td in tds]
        for field, value in zip(columns, data):
            print("{}: {}".format(field, value))
        table.append(data)
    except:
        print("Bad string value")
and you can then write to csv as
import csv

with open("myfile.csv", "wb") as outf:  # Python 2.x
# with open("myfile.csv", "w", newline="") as outf:  # Python 3.x
    outcsv = csv.writer(outf)
    # header row
    outcsv.writerow(columns)
    # data
    outcsv.writerows(table)
You could append a through g as a list within a list for each iteration of the loop. Then use this:
my_list = []
for row in table_1.find_all('tr'):
    tds = row.find_all('td')
    a = str(tds[0].get_text())
    b = str(tds[1].get_text())
    c = str(tds[2].get_text())
    d = str(tds[3].get_text())
    e = str(tds[4].get_text())
    f = str(tds[5].get_text())
    g = str(tds[7].get_text())
    my_list.append([a, b, c, d, e, f, g])
Then:
import csv

with open('output_table.csv', 'wb') as csvfile:
    wr = csv.writer(csvfile, lineterminator='\n')
    wr.writerows(my_list)
