Web Scraping through multiple urls - python

I have code that pulls the content I need, but I would like to run it through all of the gameIds played so far instead of just the one in the URL. I would like to start at 2017020001 and go through 2017021272, which I believe is roughly the end of the season (around 1272 games). How can that be done with the code below?
import csv
import requests
import os

req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()

my_data = []
pk = data['gameData']['game']['pk']
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    time = item['about']['periodTime']
    triCode = item.get('team', {}).get('triCode')
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
with open("NHL_2017020001.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
f.close()

If the game ids are numbered sequentially, it is as simple as nesting all of your code under a for loop that iterates through the game ids and using str.format() to add the necessary zero padding to the number. In that case a few parts would change:
import csv
import requests
import os

for i in range(1, 1273):
    url = 'https://statsapi.web.nhl.com/api/v1/game/201702{:04d}/feed/live?site=en_nhl'.format(i)
    req = requests.get(url)
    req.raise_for_status()
    data = req.json()

    my_data = []
    pk = data['gameData']['game']['pk']
    for item in data['liveData']['plays']['allPlays']:
        players = item.get('players')
        if players:
            player_a = players[0]['player']['fullName'] if len(players) > 0 else None
            player_b = players[1]['player']['fullName'] if len(players) > 1 else None
        else:
            player_a, player_b = None, None
        event = item['result']['event']
        time = item['about']['periodTime']
        triCode = item.get('team', {}).get('triCode')
        coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
        my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

    headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
    with open("NHL_201702{:04d}.csv".format(i), "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(my_data)
One last correction: using with ... as means you don't need to close the file explicitly, so the f.close() at the end is unnecessary.
You can find additional information on str.format() in the Python documentation's section on format string syntax.
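For example, the zero padding used in the URL above behaves like this:

>>> '{:04d}'.format(7)
'0007'
>>> '201702{:04d}'.format(1272)
'2017021272'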

You should wrap your code in a for loop that iterates over the game ids.
Something like this should work:
import csv
import requests
import os

for x in range(2017020001, 2017021273):
    req = requests.get('https://statsapi.web.nhl.com/api/v1/game/%s/feed/live?site=en_nhl' % x)
    data = req.json()

    my_data = []
    pk = data['gameData']['game']['pk']
    for item in data['liveData']['plays']['allPlays']:
        players = item.get('players')
        if players:
            player_a = players[0]['player']['fullName'] if len(players) > 0 else None
            player_b = players[1]['player']['fullName'] if len(players) > 1 else None
        else:
            player_a, player_b = None, None
        event = item['result']['event']
        time = item['about']['periodTime']
        triCode = item.get('team', {}).get('triCode')
        coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
        my_data.append([pk, player_a, player_b, event, time, triCode, coordinates_x, coordinates_y])

    headers = ["pk", "player_a", "player_b", "event", "time", "triCode", "coordinates_x", "coordinates_y"]
    with open("NHL_2017020001.csv", "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(my_data)
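Note that this keeps appending every game to NHL_2017020001.csv and rewrites the header row on each pass. If that is not what you want, a small variation, sketched here with a hypothetical parse_plays() helper standing in for the inner loop above, writes the header only once:

with open("NHL_2017_season.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)                  # header is written a single time
    for x in range(2017020001, 2017021273):
        url = 'https://statsapi.web.nhl.com/api/v1/game/%s/feed/live?site=en_nhl' % x
        data = requests.get(url).json()
        writer.writerows(parse_plays(data))   # hypothetical helper returning the my_data rows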

Related

Write to dictionary of lists to a CSV file

I want each key's values to be displayed vertically, one per row. When I write the values of my dictionary with writerows (w.writerows(d.values())) they go horizontally, while I want them to go vertically.
import csv
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.ufc.com/rankings')
s = BeautifulSoup(r.text, 'lxml')
fighters = s.find_all('div', 'view-grouping')

name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all('a')]
    name.append(z)
    divisions = x.find('h4')
    dd = divisions.text
    weightdivisions.append(dd)

d = dict(zip(weightdivisions, name))
print(d)

with open('ufc.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    w.writerows(d.values())
Try:
import csv
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.ufc.com/rankings")
s = BeautifulSoup(r.text, "lxml")
fighters = s.find_all("div", "view-grouping")

name = []
weightdivisions = []
for x in fighters:
    z = [names.string for names in x.find_all("a")]
    name.append(z)
    divisions = x.find("h4")
    dd = divisions.text
    weightdivisions.append(dd)

d = dict(zip(weightdivisions, name))

with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())

    # Turn each column (list of fighter names) into an iterator, then emit
    # one row at a time until every column is exhausted.
    for column in d:
        d[column] = iter(d[column])
    while True:
        row = [next(d[column], "") for column in d]
        if all(val == "" for val in row):
            break
        w.writerow(row)
This saves ufc.csv correctly, with each weight division as a column header and its fighters listed in the rows beneath it.
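An equivalent way to transpose the columns, assuming the same d mapping of division names to fighter lists, is itertools.zip_longest, which pads the shorter columns with empty strings for you:

import csv
from itertools import zip_longest

with open("ufc.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(d.keys())
    # Each tuple from zip_longest is one row spanning all divisions;
    # exhausted columns are filled with an empty string.
    w.writerows(zip_longest(*d.values(), fillvalue=""))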

Multi threading not processing full list

I am using multi-threading to visit links read from a spreadsheet. Strangely, irrespective of max_workers, and even when I remove the multi-threading part entirely, the code processes an arbitrarily smaller number of URLs than are in the list. I print the list to verify the count. For example, if the list has 5,000 URLs the code stops at 4,084; if there are 13,000 links it stops at 9,200; even with just 130 links it stops at around 80. What am I doing wrong here?
import requests
import xlrd
import concurrent.futures
from bs4 import BeautifulSoup
import csv

header_added = False

file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)

all_links = []
for row in range(1, 11000):
    all_links.append(sheet.cell_value(row, 0))
print(len(all_links))

i = 0

def get_solution(url):
    global header_added, i
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    ques_div = soup.find('p', class_='header-description')
    ques = ques_div.find('span').text
    ans_divs = soup.findAll('div', class_='puzzle-solution')
    ans = ans_divs[0].text
    print("Solution ", i)
    i += 1
    dict1 = {"Words": ques, "Solution": ans}
    with open('Results10k.csv', 'a+', encoding='utf-8') as f:
        w = csv.DictWriter(f, dict1.keys())
        if not header_added:
            w.writeheader()
            header_added = True
        w.writerow(dict1)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    result = executor.map(get_solution, all_links)
Here's a reworking of your code that doesn't need locks: only one process ever writes to the file.
Also, due to the GIL, a ThreadPool will be slower than a process-backed Pool for the CPU-bound parsing work.
import csv
import multiprocessing

import requests
import xlrd
from bs4 import BeautifulSoup

sess = requests.Session()

def get_solution(url):
    try:
        resp = sess.get(url)
        resp.raise_for_status()
        page = resp.text
        soup = BeautifulSoup(page, "html.parser")
        ques_div = soup.find("p", class_="header-description")
        ques = ques_div.find("span").text.strip()
        ans_divs = soup.findAll("div", class_="puzzle-solution")
        ans = ans_divs[0].text.strip()
        return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
    except Exception as exc:
        print(url, "Error:", exc)
        return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}

def read_links(file_location):
    workbook = xlrd.open_workbook(file_location)
    sheet = workbook.sheet_by_index(0)
    all_links = []
    for row in range(1, 11000):
        all_links.append(sheet.cell_value(row, 0))
    return all_links

def main():
    links = read_links("./Urls.xlsx")
    with open("Results10k.csv", "w", encoding="utf-8") as f:
        with multiprocessing.Pool() as p:  # (or multiprocessing.pool.ThreadPool)
            for i, result in enumerate(p.imap_unordered(get_solution, links, chunksize=16)):
                if i == 0:
                    writer = csv.DictWriter(f, result.keys())
                    writer.writeheader()
                writer.writerow(result)
                f.flush()  # Ensure changes are written immediately
                if i % 100 == 0:  # Progress indicator
                    print(i)

if __name__ == "__main__":
    main()
It could be that get_solution() crashes for some of the URLs. You could add a try/except to the body of the function and write all crashed URLs to a different file.

def get_solution(url):
    try:
        ...
    except:
        with open('errors.txt', 'a+') as f:
            f.write(url + '\n')

If this is the problem, the counts from both files should add up to the total number of links.
Also, open() is probably not thread safe, so writes to the shared CSV should be guarded with a lock:

import threading

file_lock = threading.Lock()

def get_solution(url):
    with file_lock:
        with open('Results10k.csv', 'a+', encoding='utf-8') as f:
            w = csv.DictWriter(f, dict1.keys())
            ...
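Putting the two suggestions together, a minimal sketch of a lock-protected version of the original get_solution() might look like the following (file names, CSS classes, and field names are taken from the question; error handling and the header row are simplified):

import csv
import threading
import concurrent.futures

import requests
from bs4 import BeautifulSoup

file_lock = threading.Lock()

def get_solution(url):
    try:
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'html.parser')
        ques = soup.find('p', class_='header-description').find('span').text
        ans = soup.find_all('div', class_='puzzle-solution')[0].text
        row = {"Words": ques, "Solution": ans}
        with file_lock:
            # Only one thread at a time appends to the shared CSV file.
            with open('Results10k.csv', 'a+', newline='', encoding='utf-8') as f:
                csv.DictWriter(f, row.keys()).writerow(row)
    except Exception:
        with file_lock:
            with open('errors.txt', 'a+') as f:
                f.write(url + '\n')

# all_links would be read from the spreadsheet exactly as in the question:
# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(get_solution, all_links)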

Loop while dynamically scraping - Python

I want to make a realtime scraper whose polling interval depends on the scraped data. For example, if the last T value I scraped is 1, it should loop once every 6 hours; if T = 2, once every hour; and if T = 3, once every minute.
But after thinking through the logic, I am confused about how to implement it.
I use T as the reference value; here is an example of the data: https://i.stack.imgur.com/H427J.png
Here is the code snippet I have made so far:
import csv
import re
import time

import numpy as np
import requests
from bs4 import BeautifulSoup

headers = ["Year", "Month", "Day", "Hour", "Minute", "Second", "T", "Height"]

page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
soup = BeautifulSoup(page.text, 'html.parser')
datas = []
dt = soup.find_all('textarea')[0].text
datas = dt.split('\n')[2:-1]

# read the scraped rows into an array and pick out the 6th field (the T value)
arr = []
arr = np.array([datas])

def listToString(s):
    str1 = ""
    for ele in s:
        str1 += ele
    return str1

coba = []
for item_list in arr:
    item_string = listToString(item_list)
    coba.append(item_string.split()[6])
print(coba)

# -----------------------------------------------
# loop with an interval that depends on the T value
while True:
    if coba[0] == 1:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            datas1 = []
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
    if coba[0] == 2:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            datas1 = []
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
    if coba[0] == 3:
        while True:
            page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
            soup = BeautifulSoup(page.text, 'html.parser')
            datas1 = []
            dt = soup.find_all('textarea')[0].text
            datas1 = dt.split('\n')[2:-1]
            with open("52406.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow(headers)
                for line in soup.select_one("#data").text.split("\n"):
                    if re.fullmatch(r"[\d. ]{30}", line) and len(line.split()) == len(headers):
                        writer.writerow(line.split())
            print('Data 1')
            addDate()
            insertSQL()
            time.sleep(3600)
            break
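One way to express the interval logic described above is to map each T value to a sleep time and re-read T on every pass, instead of repeating the scraping block three times. This is only a minimal sketch: scrape_and_store() is a hypothetical placeholder for the CSV/addDate()/insertSQL() work shown above, and the parsing assumes T is the 7th field of the newest row, as in the question.

import time

import requests
from bs4 import BeautifulSoup

# Seconds to wait for each T value: T=1 -> 6 hours, T=2 -> 1 hour, T=3 -> 1 minute.
INTERVALS = {1: 6 * 3600, 2: 3600, 3: 60}

def read_t():
    page = requests.get('https://www.ndbc.noaa.gov/station_page.php?station=52406')
    soup = BeautifulSoup(page.text, 'html.parser')
    rows = soup.find_all('textarea')[0].text.split('\n')[2:-1]
    return int(float(rows[0].split()[6]))     # T column of the newest row

def scrape_and_store():
    # Hypothetical placeholder for the CSV writing, addDate() and insertSQL() calls above.
    pass

while True:
    t_value = read_t()
    scrape_and_store()
    time.sleep(INTERVALS.get(t_value, 3600))  # default to 1 hour for an unexpected T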

Can't figure out how to properly output my data

I'm a relative novice at Python, but somehow I managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is five columns at the end of my .CSV, each holding the X-th most commonly used hashtag.
So something along the lines of separate "Top 1" through "Top 5" columns, with one hashtag per column.
I've Googled for a while and managed to isolate them separately, but I always end up with ('#thekidfromakron', 2) as the output. I seem to be missing some part of the puzzle.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter

ts = time.gmtime()

def get_csv_header(top_numb):
    fieldnames = ['USER', 'MEDIA COUNT', 'FOLLOWERCOUNT', 'TOTAL LIKES', 'TOTAL COMMENTS',
                  'ER', 'ER IN %', 'BIO', 'ALL CAPTION TEXT', 'HASHTAGS COUNTED', 'MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # which contains a list of usernames. BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'

    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv), where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)

    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)

        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code before this point simply scrapes the webpage and compiles all the captions from the last 12 posts into all_captiontext.
Any help to solve this (probably simple) issue would be greatly appreciated, as I've been struggling with this for days (again, I'm a noob).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your snippet doesn't show a csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file just to write one row. I don't think that's intended, so what you would want to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can output straight to a CSV file.
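A minimal sketch of that pandas approach, assuming each user's post_info dict (including the "Top 1" through "Top 5" keys from the loop above) gets appended to a list called results:

import pandas as pd

results = []                       # collect one post_info dict per user
# inside the user loop, instead of writing each row immediately:
#     results.append(post_info)

df = pd.DataFrame(results)         # column names come from the dict keys
df.to_csv('users_with_er.csv', index=False)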
most_common is the output of the call to hashtags_counted.most_common; see the documentation here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
with
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
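For instance, with the data shown in the question:

most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2),
               ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
[x[0] for x in most_common]
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron',
#  '#togetherwecanchangetheworld', '#halloweenchronicles']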

Write variable output to a specific column in a CSV?

I'm working on a Python script that scrapes data from an Excel doc, then writes the output to a .csv.
I was able to grab the data and get it to write to the .csv, but all of the data goes into the first column.
I need the bar data to go into the 4th column and the foo data into the 5th, so I tried to use csv.reader to select the row. This runs without error but doesn't actually write to the .csv file.
Here's my code:
import xlrd
import csv

### Grab the data
def get_row_values(workSheet, row):
    to_return = []
    num_cells = myWorksheet.ncols - 1
    curr_cell = -1
    while curr_cell < num_cells:
        curr_cell += 1
        cell_value = myWorksheet.cell_value(row, curr_cell)
        to_return.append(cell_value)
    return to_return

file_path = 'map_test.xlsx'
output = []
output_bar = []
output_foo = []

myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('Sheet1')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
print len(column_names)

while curr_row < num_rows:
    curr_row += 1
    row = myWorksheet.row(curr_row)
    this_row = get_row_values(myWorksheet, curr_row)

    x = 0
    while x < len(this_row):
        if this_row[x] == 'x':
            output.append([this_row[0], column_names[x]])
            output_bar.append([column_names[x]])
            output_foo.append([this_row[0]])
            print output
            myData = [["number", "name", "version", "bar", "foo"]]

            ##### Next section is the code in question, it
            #### doesn't error out, but won't write to the .csv ######
            myFile = open("test123.csv", "w")
            writer = csv.writer(myFile)
            with open('test123.csv', 'r') as csvfile:
                reader = csv.reader(csvfile, delimiter=',')
                for row in reader:
                    row[5] = myFile.readline()
                    writer.writerows(output_foo)
                    row[4] = myFile.readline()
                    writer.writerows(outpu_bar)

            ##### This successfully writes to the csv, but
            ##### all data to first column #####
            # myFile = open('test123.csv', 'w')
            # with myFile:
            #     writer = csv.writer(myFile)
            #     writer.writerows(myData)
            #     # writer.writerows(output)
            #     writer.writerows(output_foo)
            #     writer.writerows(output_bar)
        x += 1

print ("CSV Written")
