Pull data using regex and insert into a .csv file - python

So I am using regex to pull data from a webpage. Done.
Now I am trying to insert this data into a .csv file. No problem, right?
The trouble is pulling my data out of the loops I created so I can insert it into the .csv file. It looks like the best way to conquer this is to create a list, somehow insert the data into the list, and write the data into the csv file. But how can I do that with my current setup?
import re
import sqlite3 as lite
import mysql.connector
import urllib.request
from bs4 import BeautifulSoup
import csv

# We're pulling info on socks from e-commerce site Aliexpress
url = "https://www.aliexpress.com/premium/socks.html?SearchText=socks&ltype=wholesale&d=y&tc=ppc&blanktest=0&initiative_id=SB_20171202125044&origin=y&catId=0&isViewCP=y"
req = urllib.request.urlopen(url)
soup = BeautifulSoup(req, "html.parser")
div = soup.find_all("div", attrs={"class": "item"})

for item in div:
    title_pattern = '<img alt="(.*?)\"'
    comp = re.compile(title_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    price_pattern = 'itemprop="price">(.*?)<'
    comp = re.compile(price_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    seller_pattern = '<a class="store j-p4plog".*?>(.*?)<'
    comp = re.compile(seller_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    orders_pattern = '<em title="Total Orders">.*?<'
    comp = re.compile(orders_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x[32:-1])

    feedback_pattern = '<a class="rate-num j-p4plog".*?>(.*)<'
    comp = re.compile(feedback_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)
# Creation and insertion of CSV file
# csvfile = "aliexpress.csv"
# csv = open(csvfile, "w")
# columnTitleRow = "Title,Price,Seller,Orders,Feedback,Pair"
# csv.write(columnTitleRow)
#
# for stuff in div:
#     title =
#     price =
#     seller =
#     orders =
#     feedback =
#     row = title + "," + price + "," + seller + "," + orders + "," + feedback + "," + "\n"
#     csv.write(row)
I want to be able to print these lists by their row.

It looks like the best way to conquer this is to create a list, somehow insert the data into the list, and write the data into the csv file. But how can I do that with my current setup?
Yes, you're right. Replace your print statements with appends to a list:
data = []
for item in div:
    title_pattern = '<img alt="(.*?)\"'
    comp = re.compile(title_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        data.append(x)

    price_pattern = 'itemprop="price">(.*?)<'
    comp = re.compile(price_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        data.append(x)
And then later, once you have a csv.writer object:
writer.writerow(data)
From what I remember, the csv writer takes a list and not a rendered CSV string anyway. That's the whole point: it takes the raw data, escapes it properly, and adds the commas for you.
Edit: As explained in the comment, I misremembered the interface to the csv writer. It is writerow that takes the list, not write. Updated.
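Putting it together, here is a minimal sketch of the whole flow, reusing div and the regex patterns from the question (only title and price are shown; the other fields follow the same shape). One row is collected per item, so each product lands on its own CSV line:

import csv
import re

rows = []
for item in div:
    # One row per product: search each pattern inside this item only.
    titles = re.findall('<img alt="(.*?)"', str(item))
    prices = re.findall('itemprop="price">(.*?)<', str(item))
    rows.append([titles[0] if titles else '',
                 prices[0] if prices else ''])

# newline='' stops the csv module from writing blank lines on Windows.
with open('aliexpress.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Price'])  # header row
    writer.writerows(rows)               # one call writes every row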

Related

CSV -(excel)- Python. Seems like wrong writing on csv from python

I'm trying to export some data from a website, and I first tried it on one single page. I have to import text delimited by these titles:
['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name']
The url is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works and gives me all the data, but when I insert it into the CSV, the information is not delimited as I wish.
As it is one single page, the Excel sheet should have ONE row... but it doesn't.
The code:
from bs4 import BeautifulSoup
import requests
import csv

csv_file = open('Drugs.csv', 'w')
csv_writer = csv.writer(csv_file, delimiter='+')
csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name'])

link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux = []
soup = BeautifulSoup(link.content, 'lxml')
drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')

y = 0
for h2 in gralInfo.find_all('h2'):
    print(y)
    text = ''
    for sibling in h2.find_next_siblings():
        if (sibling.name == 'h2'):
            break
        else:
            text = text + sibling.get_text(separator='\n') + '\n'
    print(text)
    aux.append(text)
    print()
    print()
    y = y + 1

auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
    print(info.text)
    auxi.append(info.text)

csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])
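The symptom here (one logical record spilling over several spreadsheet lines) usually comes from the newlines embedded in each scraped field. A minimal sketch of a fix, assuming the same drugName, aux, and auxi values built above: open the file with newline='' so the csv module controls line endings, and collapse the newlines inside each field before writing.

import csv

with open('Drugs.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)  # default comma delimiter
    csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results',
                         'Side Effects', 'Mechanism of Action', 'Literature References',
                         'Additional Information', 'Approval Date', 'Date Created',
                         'Company Name'])
    # Collapse embedded newlines so each field stays inside one cell.
    clean = [field.replace('\n', ' ').strip() for field in aux[:6] + auxi[:3]]
    csv_writer.writerow([drugName] + clean)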

Write a list and strings in one row each in its own cell

I have extracted four items (one list of strings and three strings) and I want to write all four items in a single row. I assume it is possible and just need a bit of help.
r_ingredients_wtht_tags = []
for link in r_links:
    r = requests.get(link)
    # print(r.status_code)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, "html.parser")
        r_name = soup.find('h1', {'itemprop': 'name'}).text.replace('recipe', '')
        r_prep_time = soup.select('li.recipe-meta-tag')[1].text
        r_ingrdnts_with_tags = soup.select('span[itemprop="ingredients"]')
        for r_ingrdnts in r_ingrdnts_with_tags:  # Remove span tags from list items
            r_ingredients_wtht_tags.append(r_ingrdnts.text)
        # print(r_ingredients_wtht_tags)
        # exit()
        r_image_src = soup.find('img', {'itemprop': 'image'}).get('src')
        r_image_url = 'https://website.com' + r_image_src  # dummy website
        r_url = link
        # Download the recipe image
        print('Downloading image %s...' % (r_image_url))
        rec_image = requests.get(r_image_url)
        rec_image.raise_for_status()  # Will raise an exception if above request failed.
        # Create image folder to store current recipe image
        os.makedirs('recipe' + str(image_fold_count), exist_ok=True)
        # Save recipe image
        imageFile = open(os.path.join('recipe' + str(image_fold_count), os.path.basename(r_image_url)), 'wb')
        for chunk in rec_image.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()
        # write to csv file. NOTE TO SELF: MOVE THIS TO ITS OWN FUNCTION
        fileWriter.writerow(zip(r_name, r_prep_time, r_ingredients_wtht_tags, r_url + "\n"))
        image_fold_count += 1  # Increment recipe folder counter
If r_ingredients_wtht_tags is a list and you want all four elements in one row but in separate cells, then you can create the list this way:
row = [r_name, r_prep_time, r_ingredients_wtht_tags, r_url]
or convert r_ingredients_wtht_tags into a string with some separator. You can even use a comma as the separator, and csv will put the field into " " (quotes) automatically:
ingredients = ",".join(r_ingredients_wtht_tags)
row = [r_name, r_prep_time, ingredients, r_url]
If every ingredient has to be in a separate cell, then you can do:
row = [r_name, r_prep_time] + r_ingredients_wtht_tags + [r_url]
And write it to the csv without zip() (and without "\n"; csv will add it at the end of the row):
fileWriter.writerow(row)
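For completeness, a minimal self-contained sketch of that last variant; the values here are hypothetical stand-ins for the scraped fields above:

import csv

# Hypothetical example values (stand-ins for r_name, r_prep_time, etc. above).
r_name = "Pancakes"
r_prep_time = "20 min"
r_ingredients_wtht_tags = ["flour", "milk", "eggs"]
r_url = "https://website.com/recipes/pancakes"

with open('recipes.csv', 'w', newline='') as f:
    fileWriter = csv.writer(f)
    # Every ingredient in its own cell: plain list concatenation, no zip().
    fileWriter.writerow([r_name, r_prep_time] + r_ingredients_wtht_tags + [r_url])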

Creating a text-delimited file from HTML tables using BeautifulSoup

I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"
    # Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if (td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr
    # Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

# Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you are after the "Actions" only, I'd identify the "Actions" table and find the event-only rows. This can be done with the help of a filtering function that checks that the first cell is not empty:
import csv
from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")

data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
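Since the desired output in the question is pipe-delimited rather than comma-delimited, the same data can be written with delimiter='|' (and newline='' avoids blank lines between rows on Windows); a small variation on the last block above:

# Same data list as above, but pipe-delimited, e.g.
# 300978|60:00|GK Out|OHK|33. Hudacek, Julius
with open("output.txt", "w", newline='') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(data)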

How to get specific table data from a table row using python and BeautifulSoup4

I've researched everywhere and I am having trouble. I want to access each individual element and append the values to my empty Python lists, and of course, after I have the correct data in my lists, I am going to append them to a csv file.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html = urlopen("http://exoweb0.donbest.com/checkHTML/servlet/send_archive_history.servlet?by_casino=1&for_casino=37&league=1&game=0&date=20151029")
soup = BeautifulSoup((html), "html.parser")
trs = soup.findAll("tr")

dates = []
rotations = []
games = []
lines = []
types = []

for tr in trs.children:
    print(tr)
    # tds = tr.findAll("td")   # Finds all tds in the current tr
    # print(tds)
    # date = tds.contents[0]   # References the first element in each td
    # rot = tds[2]             # References the third element in each td
    # game = tds[3]            # References the fourth element in each td
    # line = tds[4]            # References the fifth element in each td
    # type_l = tds[5]          # References the sixth element in each td
    # dates.append(date)       # Trying to append to my empty list
    # rotations.append(rot)
    # games.append(game)
    # lines.append(line)
    # types.append(type_l)
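A minimal sketch of what the commented-out attempt seems to be reaching for, assuming the column positions in the comments are correct. Note that findAll returns a ResultSet, which has no .children, so the rows are iterated directly; the output filename here is hypothetical:

import csv

rows = []
for tr in trs:  # iterate the ResultSet directly
    tds = tr.findAll("td")
    if len(tds) < 6:  # skip header rows and short rows
        continue
    date = tds[0].get_text(strip=True)
    rot = tds[2].get_text(strip=True)
    game = tds[3].get_text(strip=True)
    line = tds[4].get_text(strip=True)
    type_l = tds[5].get_text(strip=True)
    dates.append(date)
    rotations.append(rot)
    games.append(game)
    lines.append(line)
    types.append(type_l)
    rows.append([date, rot, game, line, type_l])

with open("archive.csv", "w", newline='') as f:  # hypothetical filename
    csv.writer(f).writerows(rows)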

Parse HTML table data to JSON and save to text file in Python 2.7

I'm trying to extract the data on the crime rate across states from this webpage: http://www.disastercenter.com/crime/uscrime.htm
I am able to get this into a text file, but I would like to get the response in JSON format. How can I do this in Python?
Here is my code:
import urllib
import re
from bs4 import BeautifulSoup
link = "http://www.disastercenter.com/crime/uscrime.htm"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)
soup1=soup.find('table', width="100%")
soup3=str(soup1)
result = re.sub("<.*?>", "", soup3)
print(result)
output=open("output.txt","w")
output.write(result)
output.close()
The following code will get the data from the two tables and output all of it as a JSON-formatted string.
Working Example (Python 2.7.9):
from lxml import html
import requests
import re as regular_expression
import json

page = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = html.fromstring(page.text)

tables = [tree.xpath('//table/tbody/tr[2]/td/center/center/font/table/tbody'),
          tree.xpath('//table/tbody/tr[5]/td/center/center/font/table/tbody')]

tabs = []
for table in tables:
    tab = []
    for row in table:
        for col in row:
            var = col.text_content()
            var = var.strip().replace(" ", "")
            var = var.split('\n')
            if regular_expression.match('^\d{4}$', var[0].strip()):
                tab_row = {}
                tab_row["Year"] = var[0].strip()
                tab_row["Population"] = var[1].strip()
                tab_row["Total"] = var[2].strip()
                tab_row["Violent"] = var[3].strip()
                tab_row["Property"] = var[4].strip()
                tab_row["Murder"] = var[5].strip()
                tab_row["Forcible_Rape"] = var[6].strip()
                tab_row["Robbery"] = var[7].strip()
                tab_row["Aggravated_Assault"] = var[8].strip()
                tab_row["Burglary"] = var[9].strip()
                tab_row["Larceny_Theft"] = var[10].strip()
                tab_row["Vehicle_Theft"] = var[11].strip()
                tab.append(tab_row)
    tabs.append(tab)

json_data = json.dumps(tabs)

output = open("output.txt", "w")
output.write(json_data)
output.close()
This might be what you want, if you can use the requests and lxml modules. The data structure presented here is very simple; adjust it to your needs.
First, get a response from your requested URL and parse the result into an HTML tree:
import requests
from lxml import etree
import json
response = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = etree.HTML(response.text)
Assuming you want to extract both tables, create this XPath and unpack the results. totals is "Number of Crimes" and rates is "Rate of Crime per 100,000 People":
xpath = './/table[@width="100%"][@style="background-color: rgb(255, 255, 255);"]//tbody'
totals, rates = tree.findall(xpath)
Extract the raw data (td.find('./') means first child item, whatever tag it has) and clean the strings (r'' raw strings are needed for Python 2.x):
raw_data = []
for tbody in totals, rates:
    rows = []
    for tr in tbody.getchildren():
        row = []
        for td in tr.getchildren():
            child = td.find('./')
            if child is not None and child.tag != 'br':
                row.append(child.text.strip(r'\xa0').strip(r'\n').strip())
            else:
                row.append('')
        rows.append(row)
    raw_data.append(rows)
Zip together the table headers in the first two rows, then delete the redundant rows, seen as the 11th & 12th steps in slice notation:
data = {}
data['tags'] = [tag0 + tag1 for tag0, tag1 in zip(raw_data[0][0], raw_data[0][1])]
for raw in raw_data:
del raw[::12]
del raw[::11]
Store the rest of the raw data and create a JSON file (optional: eliminate whitespace with separators=(',', ':')):
data['totals'], data['rates'] = raw_data[0], raw_data[1]
with open('data.json', 'w') as f:
    json.dump(data, f, separators=(',', ':'))
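A quick way to sanity-check the result is to read the file back with json.load and inspect the structure built above (the tags list plus the two lists of rows); this is purely an illustrative check:

import json

with open('data.json') as f:
    data = json.load(f)

print(data['tags'])                             # combined header tags
print(len(data['totals']), len(data['rates']))  # row counts per table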
