My code throws a "list index out of range" error.
import requests
from bs4 import BeautifulSoup
import re
import pyperclip
# import pandas as pd
import csv

# Get the agency names
def getAgency(pageURL):
    res = requests.get(pageURL)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('h3.company-name > a')
    names = []
    for i in range(len(elems)):
        names.append(str(elems[i].text.strip()))
    return names

# Get the agency websites
def getWebsite(pageURL):
    res = requests.get(pageURL)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    elems = soup.select('li.website-link.website-link-a > a')
    sites = []
    for elem in elems:
        if elem.find('/your-project') != -1:
            elems.remove(elem)
        else:
            pass
    for i in range(len(elems)):
        sites.append(str(elems[i]["href"]))
    return sites

allNames = []
for pagenumber in range(0, 1):
    names = getAgency('https://clutch.co/agencies/digital?page=' + str(pagenumber))
    allNames += names

allSites = []
for pagenumber in range(0, 1):
    sites = getWebsite('https://clutch.co/agencies/digital?page=' + str(pagenumber))
    allSites += sites

final = []
with open('text.csv', 'w', newline='') as f:
    a = csv.writer(f, delimiter=',')
    for index in range(len(allNames)):
        final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
    a.writerows(final)

finalresult = "\n".join(final)
pyperclip.copy(finalresult)
I know what causes this error: this if statement.
for elem in elems:
    if elem.find('/your-project') != -1:
        elems.remove(elem)
    else:
        pass
When I delete an item from the list, the remaining indices shift down by one. In the for loop below, the index runs over the length of allNames, and I use the same index for allSites. So when the index reaches the last item of allNames, indexing allSites throws an error because that index is out of range. What can I do to solve this problem?
for index in range(len(allNames)):
    final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
a.writerows(final)
You need to parse the page in a single pass, emitting (name, site) pairs, and then filter out the unwanted pairs together.
As your code stands, the two lists aren't even the same length before you try to filter them by '/your-project', because there are advertising links that also match 'li.website-link.website-link-a > a'.
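A minimal sketch of that approach, assuming each listing sits in a container such as li.provider-row (an assumed selector, not taken from the original code; inspect the live page and adjust it):

import csv
import requests
from bs4 import BeautifulSoup

def getPairs(pageURL):
    res = requests.get(pageURL)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    pairs = []
    # 'li.provider-row' is an assumed container selector; the point is
    # to walk one listing at a time so name and site stay together.
    for listing in soup.select('li.provider-row'):
        name = listing.select_one('h3.company-name > a')
        site = listing.select_one('li.website-link.website-link-a > a')
        if name is None or site is None:
            continue  # skip ads and listings missing either field
        href = site['href']
        if '/your-project' in href:
            continue  # drop the whole pair, not just the site
        pairs.append((name.text.strip().replace(',', ' '), href))
    return pairs

pairs = getPairs('https://clutch.co/agencies/digital?page=0')
with open('text.csv', 'w', newline='') as f:
    csv.writer(f).writerows(pairs)

Because each name and site leave the page together, the two columns can never get out of step, and filtering removes a whole row at once.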
Hello, I'm a beginner to Python and programming in general, and I was wondering how to make the output a list. I used BeautifulSoup to extract data from a table and attempted to build a list from it, but I only ever add the first number to the list. Can someone provide assistance and an explanation?
from bs4 import BeautifulSoup
from requests_html import HTMLSession

s = HTMLSession()
url = 'https://www.timeanddate.com/weather/usa/new-york/ext'

def get_data(url):
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

with open('document.txt', 'a') as f:
    f.write(str(get_data(url)))

with open('document.txt', 'r') as html_file:
    contents = html_file.read()
    soup = BeautifulSoup(contents, 'lxml')

forecast_table = soup.find('table', class_='zebra tb-wt fw va-m tb-hover')

wtitle = soup.title.text
print(wtitle)
print("------")

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                print(pl_high_final)

get_weather_high(forecast_table)
This is the output. Instead of each line being a separate number, I want them all in one list.
Create a list before your for loop and append your data to it instead of printing it, then print the list after the for loop:
data = []

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                data.append(pl_high_final)
    print(data)  # or return data if you need it somewhere else
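If you'd rather return the list than print it (as the comment suggests), a minimal variant of the same function:

def get_weather_high(forecast_table):
    data = []
    for high in forecast_table.find_all('tbody'):
        for row1 in high.find_all('tr'):
            cells = [td.text.strip() for td in row1.find_all('td')]
            for value in cells[1:2]:
                data.append(value[0:3])  # first 3 characters, as before
    return data

highs = get_weather_high(forecast_table)
print(highs)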
Premise / What you want to achieve

Obtain data such as stock codes and stock names from the web with Python 3. I'd like to output the results to CSV, but the arrangement is not what I want.

Problems / error messages that are occurring

At present, I have succeeded in collecting the stock codes and the stock names. Next, I would like to output them to CSV in the following arrangement:

Stock code 1, Stock name 1
Stock code 2, Stock name 2
Stock code 3, Stock name 3
...
from bs4 import BeautifulSoup
import requests
import csv

# Yahoo! Finance
url = "https://info.finance.yahoo.co.jp/ranking/?kd=27&mk=1&tm=d&vl=a"

l = list()
t = list()
num = 1

while num <= 51:
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    codes = soup.select("#contents-body-bottom > div.rankdata > div.rankingTableWrapper > table > tbody > tr:nth-child(" + str(num) + ") > td:nth-child(1) > a")
    if len(codes) == 0:
        break
    num += 1
    for code in codes:
        l.append(code.text)

base_url = "https://kabutan.jp/stock/?code="
num = 0
for i in l:
    url = base_url + str(l[num])
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    titles = soup.select("#kobetsu_right > div.company_block > h3")
    num += 1
    for title in titles:
        t.append(title.text)

# csv
with open('data.csv', 'w') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(l)
    writer.writerow(t)
You can write the pairs one per line using the built-in zip() function, or itertools.zip_longest() if the two lists may differ in length:
for row in zip(l, t):
    writer.writerow(row)
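If the two lists can end up with different lengths (for example, when one of the kabutan.jp lookups returns nothing), itertools.zip_longest pads the shorter list instead of silently dropping rows. A sketch of the same CSV step:

import csv
from itertools import zip_longest

with open('data.csv', 'w') as file:
    writer = csv.writer(file, lineterminator='\n')
    # fillvalue='' leaves an empty cell where one list is shorter
    for row in zip_longest(l, t, fillvalue=''):
        writer.writerow(row)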
I'm trying to scrape a website in Python. I got the links to print, but when I try to make them a set to deduplicate them, there are still duplicates. Does anyone have advice on what I'm doing wrong? Thanks in advance!
Edit: I tried what John suggested, but my CSV output is a cascading list of links across the Excel sheet. I'll post the changed code below the original:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print('The number of links to start with are: ', len(mylink))
# output = The number of links to start with are: 254

import csv

with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter=' ', lineterminator='\r')
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            ''
        elif hrefs.startswith('http'):
            MySet = set()
            MySet.add(hrefs)
        elif hrefs.startswith('#'):
            ''
        elif hrefs.startswith(' '):
            ''
        print(set(MySet))
        file.write(str(MySet)+'\n')
    file.close
#Edited code:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254

import csv

with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter=',', lineterminator='\r')
    MySet = set()
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            continue
        elif hrefs.startswith('#'):
            continue
        elif hrefs.startswith(' '):
            continue
        elif hrefs.startswith('http'):
            MySet.add(hrefs)
            file.write(str(MySet)+'\n')
    file.close
    print(str(MySet) + '\n')
To get unique links, check whether the link is already in MySet with hrefs not in MySet.
For a simple job like this you don't need the csv module: to write one link per row (a single column), join with "\n".join(MySet); to write them all on a single row, join with ",".join(MySet).
MySet = set()
for link in mylink:
    hrefs = link.get('href')
    if not hrefs or hrefs.startswith('#'):
        continue
    # normalize link
    if hrefs.startswith('/'):
        hrefs = 'https://www.census.gov' + hrefs
    # check if link already in MySet
    if hrefs not in MySet:
        MySet.add(hrefs)

with open('census_links.csv', 'w', newline='') as f:
    f.write("\n".join(MySet))

print("\n".join(MySet))
Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
    hrefs = str(link.get('href'))
    ...
    if hrefs.startswith('http'):
        MySet.add(hrefs)
...
print(MySet)
Use the same code as above to get the content:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print('The number of links to start with are: ', len(mylink))
# output = The number of links to start with are: 254
Then use pandas to keep only the unique URLs that start with http:
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)
I am trying to write a program that opens a URL, finds a name on a certain line, and saves it. Then it should find the URL on the same line as the name, open it, and find the name + URL at the same line number as on the previous page. It should do this 4 times.
I can't get it to iterate with the new url parameter; it keeps returning the same name and URL. What is going wrong here? Thanks.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import ssl

linklist = list()
namelist = list()
linelist = list()
count = 0
listposition = int(input("Please enter list position: "))
goodnamelist = list(["Fikret"])
nexturl = "http://py4e-data.dr-chuck.net/known_by_Fikret.html"

def listfunction(url):
    ctx = ssl.create_default_context()
    # Allows reading of HTTPS pages
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    linelist = soup('a')
    for line in linelist:
        # Creates list of links in webpage:
        linklist.append(re.findall("(http://.+)\"", str(line)))
        # Creates list of names in line:
        namelist.append(re.findall(">(.+)</a>", str(line)))
    # Picks out the name in the designated user-input position:
    goodnamelist.append(namelist[listposition][0])
    nexturl = linklist[listposition][0]
    return nexturl

while (count < 4):
    nexturl = listfunction(nexturl)
    print(listfunction(nexturl))
    count += 1
    print(nexturl)
    continue

print(linelist)
print(linklist)
print(namelist)
print(nexturl)
print(goodnamelist)
print(listfunction(nexturl))
listfunction() appends to the global linklist and namelist on every call, so linklist[listposition] always picks out an entry from the first page you parsed; the function therefore returns the same name and URL every time. Make those lists local to the function so they are rebuilt on each call, and call listfunction() only once per loop iteration (the extra print(listfunction(nexturl)) walks the chain a second time).
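A sketch of the function with the lists made local, so the indexed entry always comes from the page that was just fetched (note it reads the href and link text through BeautifulSoup instead of the regular expressions):

def listfunction(url):
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    anchors = soup('a')
    # Local lists are rebuilt on every call instead of accumulating
    # entries from previously visited pages.
    links = [a.get('href') for a in anchors]
    names = [a.text for a in anchors]
    goodnamelist.append(names[listposition])
    return links[listposition]

while count < 4:
    nexturl = listfunction(nexturl)  # call exactly once per iteration
    print(nexturl)
    count += 1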
The snippet below "works", but it only outputs the first record to the CSV. I'm trying to get the same output for each gun in the list of gun URLs in all_links.
Any modification I've made to it with prints (just to see it working) prints the same result, and if I build a gun_details list and try to print it, I get the same one-item output.
How would I go about writing all the gun_details labels and spans to a CSV?
import csv
import urllib.request
import requests
from bs4 import BeautifulSoup

all_links = []
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"

for page in range(1, 3):
    res = requests.get(url).text
    soup = BeautifulSoup(res, "html.parser")
    for link in soup.select(
        'a[href*="dealers/minsterley/minsterley-ranges/guns/shotguns/"]'
    ):
        all_links.append("https://www.guntrader.uk" + link["href"])

for a_link in all_links:
    gun_label = []
    gun_span = []
    res = urllib.request.urlopen(a_link)
    # res = requests.get(a_link)
    soup = BeautifulSoup(res, "html.parser")
    for gun_details in soup.select("div.gunDetails"):
        for l in gun_details.select("label"):
            gun_label.append(l.text.replace(":", ""))
        for s in gun_details.select("span"):
            gun_span.append(s.text)

my_dict = dict(zip(gun_label, gun_span))

with open("mycsvfile.csv", "w") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=None)
    for key in my_dict.keys():
        csvfile.write(f"{key},{my_dict[key]}\n")
Try running the middle section this way:
for a_link in all_links:
    gun_label = []
    gun_span = []
    res = requests.get(a_link)
    soup = BeautifulSoup(res.content, 'html.parser')  # note it's 'res.content', not just 'res'
    for gun_details in soup.select('div.gunDetails'):
        for l in gun_details.select('label'):
            gun_label.append(l.text.replace(':', ''))
        for s in gun_details.select('span'):
            gun_span.append(s.text)
    # this block is now indented differently - it's INSIDE the 'for' loop
    my_dict = dict(zip(gun_label, gun_span))
    with open('mycsvfile.csv', 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=None)
        for key in my_dict.keys():
            csvfile.write(f"{key},{my_dict[key]}\n")
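One caveat with this version: the file is opened in 'a' (append) mode inside the loop, so each gun's rows accumulate during the run, but re-running the script keeps appending to mycsvfile.csv. Delete or truncate the file before a fresh run, or open it once in 'w' mode before the loop and write to that handle inside it.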