Data Wrangling
This post covers data gathering: downloading book pages from isbndb.com by ISBN and scraping them with BeautifulSoup.
Data Gathering
import pandas as pd

csv_file = './data/goodreads/books.csv'
# skip malformed rows (error_bad_lines=False was removed in pandas 2.0)
df = pd.read_csv(csv_file, on_bad_lines='skip')
display(df.head(1))

isbn = df.loc[0, 'isbn']
print(isbn)
# build the isbndb.com page URL for this book
url = f'https://isbndb.com/book/{isbn}'
print(url)
# download using requests.get
import requests

url = f'https://isbndb.com/book/{isbn}'
response = requests.get(url)

file = f'./data/goodreads/pages/{isbn}.html'
with open(file, 'wb') as fp:
    fp.write(response.content)
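A bare requests.get saves whatever comes back, error pages included. Here is a hedged variant that checks the status first (the User-Agent string and timeout are illustrative values, not from the original post; some sites reject requests without one):

import requests

url = f'https://isbndb.com/book/{isbn}'
# illustrative headers and timeout, not values from the original code
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
if response.ok:  # any status code below 400
    with open(f'./data/goodreads/pages/{isbn}.html', 'wb') as fp:
        fp.write(response.content)
else:
    print(f'request failed with status {response.status_code}')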
# download using HTMLSession
from requests_html import HTMLSession

url = f'https://isbndb.com/book/{isbn}'
session = HTMLSession()
response = session.get(url)

file = f'./data/goodreads/pages/{isbn}.html'
with open(file, 'wb') as fp:
    fp.write(response.content)
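The practical reason to reach for HTMLSession instead of plain requests is that it can execute a page's JavaScript. A minimal sketch, with the caveat that render() downloads a headless Chromium build on first use:

from requests_html import HTMLSession

session = HTMLSession()
response = session.get(f'https://isbndb.com/book/{isbn}')
# render() runs the page's JavaScript in headless Chromium;
# the first call downloads Chromium, which takes a while
response.html.render()
print(response.html.find('title', first=True).text)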
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

# parse the live response with the built-in parser...
soup = BeautifulSoup(response.content, 'html.parser')

# ...or re-parse the saved file with the faster lxml parser
file = f'./data/goodreads/pages/{isbn}.html'
with open(file, 'rb') as fp:
    soup = BeautifulSoup(fp, 'lxml')
print(soup.find('title'))
print(soup.find('title').contents)
print(soup.find('title').contents[0])
# drop the trailing site name from the <title>
# (assuming it ends with a ' | ISBNdb'-style suffix)
print(soup.find('title').contents[0][:-len(' | ISBNdb')])
print(soup.find('h1'))
print(soup.find('h1').contents)
print(soup.find('h1').contents[0])
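find() returns None when a tag is missing, so the chained .contents[0] calls above raise AttributeError on any page without a matching tag. A defensive sketch:

h1 = soup.find('h1')
if h1 is not None:
    # get_text() also flattens any nested tags that .contents[0] would miss
    print(h1.get_text(strip=True))
else:
    print('no <h1> found on this page')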
from bs4 import BeautifulSoup

html = """
<ul>
<li>First</li>
<li>Second</li>
<li>Third</li>
</ul>
"""
soup = BeautifulSoup(html, 'html.parser')

# find() returns only the first matching element
print(soup.find('li'))
print()

# find_all() returns every matching element
for n in soup.find_all('li'):
    print(n)
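BeautifulSoup also understands CSS selectors: select_one() behaves like find() and select() like find_all(), which often reads more naturally for nested markup:

# select_one() returns the first match, like find()
print(soup.select_one('ul > li').text)

# select() returns a list of all matches, like find_all()
print([li.text for li in soup.select('ul > li')])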
import requests
import pandas as pd
import os

csv_file = './data/goodreads/books.csv'
df = pd.read_csv(csv_file, on_bad_lines='skip')
#display(df.head(1))

isbn_all = df.isbn.values
print(len(isbn_all))

# download the first ten book pages and save each to disk
for idx, isbn in enumerate(isbn_all[:10]):
    print(idx, end=' \r')
    url = f'https://isbndb.com/book/{isbn}'
    file = f'./data/goodreads/pages/{isbn}.html'
    response = requests.get(url)
    with open(file, 'wb') as fp:
        fp.write(response.content)
    #break
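Re-running that cell downloads every page again. Here is a sketch that skips files already on disk and pauses between requests (the one-second delay is an arbitrary politeness value, not something from the original run):

import os
import time

for idx, isbn in enumerate(isbn_all[:10]):
    file = f'./data/goodreads/pages/{isbn}.html'
    if os.path.exists(file):
        continue  # already downloaded on a previous run
    response = requests.get(f'https://isbndb.com/book/{isbn}')
    with open(file, 'wb') as fp:
        fp.write(response.content)
    time.sleep(1)  # be polite to the server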
# re-open each saved page and pull the book title from the <h1>
for idx, isbn in enumerate(isbn_all[:10]):
    file = f'./data/goodreads/pages/{isbn}.html'
    with open(file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'lxml')
    title = soup.find('h1').contents[0]
    print(title)
# for each saved page, pull the details table and collect its cell text
DATA = {}
for idx, isbn in enumerate(isbn_all[:10]):
    file = f'./data/goodreads/pages/{isbn}.html'
    with open(file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'lxml')
    table = soup.find('table', {'class': 'table table-hover table-responsive'})
    data = []
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        for cell in cells:
            data.append(cell.text)
    print(data)
    # keep only complete records: title, ISBN-13, and price
    if len(data) == 3:
        DATA[isbn] = data
    print()
    #break

display(DATA)
df = pd.DataFrame(DATA).transpose()
df.columns = ['Title', 'ISBN13', 'PRICE']
display(df)
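As an aside, the same frame can be built in one step with from_dict, which takes the orientation and column names directly:

df = pd.DataFrame.from_dict(DATA, orient='index',
                            columns=['Title', 'ISBN13', 'PRICE'])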
df.to_csv('data.csv')
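to_csv() writes the ISBNs as an unnamed index column, so reading the file back needs index_col=0:

df_check = pd.read_csv('data.csv', index_col=0)
display(df_check.head())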