How to iterate over URLs for writing CSV - python

I would like to have the information captured in the web by different URLs (I have them in a list called "cod") written to a CSV file, row by row (for export to Excel).
I have tried with just one link, but if I want to do it with all the elements of the List, I'd need to iterate, and am having difficulty.
My code:
import urllib
from bs4 import BeautifulSoup
import csv
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
loong = len(cod)
i = 0
sock = urllib.urlopen(urlfixed + "Cod=" + cod[i])
htmlSource = sock.read()
sock.close()
soup = BeautifulSoup(htmlSource)
form = soup.find("form", {'id': "FORM1"})
valores = [item.get('value') for item in form.find_all('input')]
valores.remove('Imprimir')
valores.remove('Cerrar')
values = valores
out = open('tomate.csv', 'w')
w = csv.writer(out)
w.writerow([s.encode("utf-8") for s in values])
out.close()
So, one row with the info from one "cod", and that should makes 10 lines in the "tomate.csv".

Just use a for loop with the iterator iterating through the list cod and you are opening the file for writing when it should have been append :
urlfixed = "http://www.fatm.com.es/Datos_Equipo.asp?"
cod = ["01GR0001","01GR0004","03GR0006","02GR0003","01GR0030","01GR0018","04GR0007","03GR0032","01AL0001","02AL0003"]
for i in cod:
sock = urllib.urlopen(urlfixed + "Cod=" + i)
htmlSource = sock.read()
sock.close()
soup = BeautifulSoup(htmlSource)
form = soup.find("form", {'id': "FORM1"})
valores = [item.get('value') for item in form.find_all('input')]
valores.remove('Imprimir')
valores.remove('Cerrar')
values = valores
out = open('tomate.csv', 'ab')
w = csv.writer(out)
w.writerow([s.encode("utf-8") for s in values])
out.close()
#the loop ends here

Related

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

I'm trying to scrape a website in Python, I got the links to print but in trying to make them a set to deduplicate, there are still duplicates. Anyone have any advice on what I am doing wrong? Thanks in advance!
Edit: So I tried what John suggested but my csv output is a cascading list of links across the excel sheet, it's crazy...I'll post the changes below this original code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ' ', lineterminator = '\r')
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
''
elif hrefs.startswith('http'):
MySet = set()
MySet.add(hrefs)
elif hrefs.startswith('#'):
''
elif hrefs.startswith(' '):
''
print(set(MySet))
file.write(str(MySet)+'\n')
file.close
#Edited code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ',', lineterminator = '\r')
MySet = set()
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
continue
elif hrefs.startswith('#'):
continue
elif hrefs.startswith(' '):
continue
elif hrefs.startswith('http'):
MySet.add(hrefs)
file.write(str(MySet)+'\n')
file.close
print(str(MySet) +'\n')
to get unique links, you want to check if the link is in MySet with hrefs not in MySet.
for simple operation you don't need csv, to write in single row
"\n".join(MySet)
and to write single column
",".join(MySet)
MySet = set()
for link in mylink:
hrefs = link.get('href')
if not hrefs or hrefs.startswith('#'):
continue
# normalize link
if hrefs.startswith('/'):
hrefs = 'https://www.census.gov' + hrefs
# check if link already in MySet
if hrefs not in MySet:
MySet.add(hrefs)
with open('census_links.csv', 'w', newline='') as f:
f.write("\n".join(MySet))
print("\n".join(MySet))
Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
hrefs = str(link.get('href'))
...
if hrefs.startswith('http'):
MySet.add(hrefs)
...
print(MySet)
same code part to get content.
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
use pandas to get the unique url which starts with http.
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)

CSV -(excel)- Python. Seems like wrong writing on csv from python

I´m trying to export some data from a website and I first tried on one single page. I´ve to import text delimited by titles:
['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature
References','Additional Information','Approval Date','Date Created','Company Name']
The url is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works, it gives me all the data. But when I insert it on the CSV , the information is not delimited as I wish.
As it is one single page, the excel should have ONE row... but it doesn´t
The code:
from bs4 import BeautifulSoup
import requests
import csv
csv_file = open('Drugs.csv','w')
csv_writer = csv.writer(csv_file, delimiter ='+')
csv_writer.writerow(['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature References','Additional Information','Approval Date','Date Created','Company Name'])
link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux =[]
soup = BeautifulSoup(link.content, 'lxml')
drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')
y = 0
for h2 in gralInfo.find_all('h2'):
print (y)
text =''
for sibling in h2.find_next_siblings():
if (sibling.name == 'h2'):
break
else:
text = text + sibling.get_text(separator ='\n') + '\n'
print(text)
aux.append(text)
print()
print()
y = y + 1
auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
print(info.text)
auxi.append(info.text)
csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])

Beautiful Soup - Results to CSV for all items in lists

The below snippet "works" but is only outputting the first record to the CSV. I'm trying to get it to output the same output, but for each gun in the list of gun urls in the all_links list.
Any modification i've made to it with prints for the output (just to see it working) prints the same result
or if i make a gun_details list and try to print it, get the same one item output.
How would i go about printing all the gun_details labels and spans into a CSV?
import csv
import urllib.request
import requests
from bs4 import BeautifulSoup
all_links = []
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1, 3):
res = requests.get(url).text
soup = BeautifulSoup(res, "html.parser")
for link in soup.select(
'a[href*="dealers/minsterley/minsterley-ranges/guns/shotguns/"]'
):
all_links.append("https://www.guntrader.uk" + link["href"])
for a_link in all_links:
gun_label = []
gun_span = []
res = urllib.request.urlopen(a_link)
# res = requests.get(a_link)
soup = BeautifulSoup(res, "html.parser")
for gun_details in soup.select("div.gunDetails"):
for l in gun_details.select("label"):
gun_label.append(l.text.replace(":", ""))
for s in gun_details.select("span"):
gun_span.append(s.text)
my_dict = dict(zip(gun_label, gun_span))
with open("mycsvfile.csv", "w") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in my_dict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")
Try running the middle section this way:
for a_link in all_links:
gun_label = []
gun_span = []
res = requests.get(a_link)
soup = bs(res.content, 'html.parser') #note it's 'res.content', not just 'res'
for gun_details in soup.select('div.gunDetails'):
for l in gun_details.select('label'):
gun_label.append(l.text.replace(':',''))
for s in gun_details.select('span'):
gun_span.append(s.text)
#this block is now indented differently - it's INSIDE the 'for' loop
my_dict = dict(zip(gun_label, gun_span))
with open('mycsvfile.csv', 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in my_dict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")

Creating a text-delimited file from HTML tables using BeautifulSoup

I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request
def createtext():
gamestr = urlstr + "|"
#Find all table lines. Create one pipe-delimited line for each.
aptext = gamestr
for el in soup.find_all('tr'):
playrow = el.find_all('td', 'tdOdd')
for td in playrow:
if(td.find(text=True)) not in ("", None, "\n"):
aptext = aptext + ''.join(td.text) + "|"
aptext = aptext + "\n" + gamestr
#Creates file with Game # as filename and writes the data to the file
currentfile = urlstr + ".txt"
with open(currentfile, "w") as f:
f.write(str(aptext))
#Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually, Python provides a built-in csv module for that.
Then, since you are up to "actions" only, I'd identify the "actions" table and find the events-only rows. This can be done with the help of a filtering function checking the first cell to not be empty:
import csv
from bs4 import BeautifulSoup
import requests
def only_action_rows(tag):
if tag.name == 'tr':
first_cell = tag.find('td', class_='tdOdd')
return first_cell and first_cell.get_text(strip=True)
event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
actions_table = soup.find("h2", text="Actions").find_parent("table")
data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
for row in actions_table.find_all(only_action_rows)]
with open("output.csv", "w") as f:
writer = csv.writer(f)
writer.writerows(data)
Note that I'm using requests here.

Parse HTML table data to JSON and save to text file in Python 2.7

I'm trying to extract the data on the crime rate across states from
this webpage, link to web page
http://www.disastercenter.com/crime/uscrime.htm
I am able to get this into text file. But I would like to get the
response in Json format. How can I do this in python.
Here is my code:
import urllib
import re
from bs4 import BeautifulSoup
link = "http://www.disastercenter.com/crime/uscrime.htm"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)
soup1=soup.find('table', width="100%")
soup3=str(soup1)
result = re.sub("<.*?>", "", soup3)
print(result)
output=open("output.txt","w")
output.write(result)
output.close()
The following code will get the data from the two tables and output all of it as a json formatted string.
Working Example (Python 2.7.9):
from lxml import html
import requests
import re as regular_expression
import json
page = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = html.fromstring(page.text)
tables = [tree.xpath('//table/tbody/tr[2]/td/center/center/font/table/tbody'),
tree.xpath('//table/tbody/tr[5]/td/center/center/font/table/tbody')]
tabs = []
for table in tables:
tab = []
for row in table:
for col in row:
var = col.text_content()
var = var.strip().replace(" ", "")
var = var.split('\n')
if regular_expression.match('^\d{4}$', var[0].strip()):
tab_row = {}
tab_row["Year"] = var[0].strip()
tab_row["Population"] = var[1].strip()
tab_row["Total"] = var[2].strip()
tab_row["Violent"] = var[3].strip()
tab_row["Property"] = var[4].strip()
tab_row["Murder"] = var[5].strip()
tab_row["Forcible_Rape"] = var[6].strip()
tab_row["Robbery"] = var[7].strip()
tab_row["Aggravated_Assault"] = var[8].strip()
tab_row["Burglary"] = var[9].strip()
tab_row["Larceny_Theft"] = var[10].strip()
tab_row["Vehicle_Theft"] = var[11].strip()
tab.append(tab_row)
tabs.append(tab)
json_data = json.dumps(tabs)
output = open("output.txt", "w")
output.write(json_data)
output.close()
This might be what you want, if you can use the requests and lxml modules. The data structure presented here is very simple, adjust this to your needs.
First, get a response from your requested URL and parse the result into an HTML tree:
import requests
from lxml import etree
import json
response = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = etree.HTML(response.text)
Assuming you want to extract both tables, create this XPath and unpack the results. totals is "Number of Crimes" and rates is "Rate of Crime per 100,000 People":
xpath = './/table[#width="100%"][#style="background-color: rgb(255, 255, 255);"]//tbody'
totals, rates = tree.findall(xpath)
Extract the raw data (td.find('./') means first child item, whatever tag it has) and clean the strings (r'' raw strings are needed for Python 2.x):
raw_data = []
for tbody in totals, rates:
rows = []
for tr in tbody.getchildren():
row = []
for td in tr.getchildren():
child = td.find('./')
if child is not None and child.tag != 'br':
row.append(child.text.strip(r'\xa0').strip(r'\n').strip())
else:
row.append('')
rows.append(row)
raw_data.append(rows)
Zip together the table headers in the first two rows, then delete the redundant rows, seen as the 11th & 12th steps in slice notation:
data = {}
data['tags'] = [tag0 + tag1 for tag0, tag1 in zip(raw_data[0][0], raw_data[0][1])]
for raw in raw_data:
del raw[::12]
del raw[::11]
Store the rest of the raw data and create a JSON file (optional: eliminate whitespace with separators=(',', ':')):
data['totals'], data['rates'] = raw_data[0], raw_data[1]
with open('data.json', 'w') as f:
json.dump(data, f, separators=(',', ':'))

Categories

Resources