Data Wrangling

This post covers Data Gathering: downloading book pages by ISBN with requests and requests_html, and scraping the results with BeautifulSoup into a pandas DataFrame.

Data Gathering

import pandas as pd

csv_file = './data/goodreads/books.csv'
# pandas >= 1.3: on_bad_lines='skip' replaces the older error_bad_lines=False
df = pd.read_csv(csv_file, on_bad_lines='skip')
display(df.head(1))
isbn = df.loc[0, 'isbn']

print(isbn)

#url = f'https://isbnsearch.org/isbn/{isbn}'  # alternative lookup site
url = f'https://isbndb.com/book/{isbn}'
print(url)
# download using requests.get
import requests

url = f'https://isbndb.com/book/{isbn}'

response = requests.get(url)
file = f'./data/goodreads/pages/{isbn}.html'

with open(file, 'wb') as fp:
    fp.write(response.content)
# download using HTMLSession
from requests_html import HTMLSession

url = f'https://isbndb.com/book/{isbn}'

session = HTMLSession()
response = session.get(url)
file = f'./data/goodreads/pages/{isbn}.html'

with open(file, 'wb') as fp:
    fp.write(response.content)
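requests_html can also parse the response it just fetched, so a quick check of the page does not need a separate parser. A minimal sketch, assuming the book title sits in the page's first h1 element:

# parse straight from the HTMLSession response
h1 = response.html.find('h1', first=True)
if h1 is not None:
    print(h1.text)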
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')

file = f'./data/goodreads/pages/{isbn}.html'

with open(file, 'rb') as fp:
    soup = BeautifulSoup(fp, 'lxml')
print(soup.find('title'))
print(soup.find('title').contents)
print(soup.find('title').contents[0])
print(soup.find('title').contents[0][:-len('ISBNdb')+4])
print(soup.find('h1'))
print(soup.find('h1').contents)
print(soup.find('h1').contents[0])
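Indexing into .contents breaks when a tag is missing or holds nested markup. A more defensive variant of the same extraction, as a sketch:

# get_text() flattens nested text; the None check avoids an AttributeError
h1 = soup.find('h1')
title = h1.get_text(strip=True) if h1 is not None else None
print(title)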
from bs4 import BeautifulSoup
import re

html = """
<ul>
<li>First</li>
<li>Second</li>
<li>Third</li>
</ul>
"""   
soup = BeautifulSoup(html,'html.parser')

# find() returns only the first matching element
print(soup.find('li'))

print()

# find_all() returns every matching element
for n in soup.find_all('li'):
    print(n)
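Since re is already imported, find_all can also match on a compiled pattern, and select() accepts CSS selectors; a small sketch on the same snippet:

# match list items whose text contains 'ir' (First, Third)
for n in soup.find_all('li', string=re.compile('ir')):
    print(n)

# CSS selector: every li inside a ul
for n in soup.select('ul > li'):
    print(n)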
import requests
from requests_html import HTMLSession

import pandas as pd
import os

csv_file = './data/goodreads/books.csv'
# pandas >= 1.3: on_bad_lines='skip' replaces the older error_bad_lines=False
df = pd.read_csv(csv_file, on_bad_lines='skip')
#display(df.head(1))


isbn_all = df.isbn.values
print(len(isbn_all))

for idx, isbn in enumerate(isbn_all[:10]):
    print(idx, end='  \r')  # carriage return keeps the progress counter on one line
    url = f'https://isbndb.com/book/{isbn}'
    file = f'./data/goodreads/pages/{isbn}.html'

    response = requests.get(url)

    with open(file, 'wb') as fp:
        fp.write(response.content)
    #break
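The loop above fetches every page again on each run and writes whatever the server returns. A sketch of a slightly more careful version, assuming it is acceptable to skip files that already exist and to pause between requests (os is imported above; time is standard library):

import time

for idx, isbn in enumerate(isbn_all[:10]):
    url = f'https://isbndb.com/book/{isbn}'
    file = f'./data/goodreads/pages/{isbn}.html'

    if os.path.exists(file):         # already downloaded, skip
        continue

    response = requests.get(url)
    if response.status_code != 200:  # don't save error pages
        print(f'{isbn}: HTTP {response.status_code}')
        continue

    with open(file, 'wb') as fp:
        fp.write(response.content)
    time.sleep(1)                    # small pause between requests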
for idx, isbn in enumerate(isbn_all[:10]):
    
    file = f'./data/goodreads/pages/{isbn}.html'

    with open(file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'lxml')
    title = soup.find('h1').contents[0]
    print(title)
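If a page failed to download or uses a different layout, find('h1') returns None and the loop above raises an AttributeError. A guarded version of the same loop, as a sketch:

for idx, isbn in enumerate(isbn_all[:10]):
    file = f'./data/goodreads/pages/{isbn}.html'
    with open(file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'lxml')

    h1 = soup.find('h1')
    # report pages without an h1 instead of crashing
    print(h1.get_text(strip=True) if h1 is not None else f'{isbn}: no title found')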
DATA = {}
for idx, isbn in enumerate(isbn_all[:10]):
    
    file = f'./data/goodreads/pages/{isbn}.html'

    with open(file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'lxml')
        
    table = soup.find('table', {'class': 'table table-hover table-responsive'})
    #print(f'TABLE: {table}')
    #print()
    data = []
    rows = table.find_all('tr')
    #print(title)
    for row in rows:
        #print(f'ROW: {row}')
        cells = row.find_all('td')
        
        for cell in cells:
            data.append(cell.text)
    print(data)    
    if len(data) == 3:
        DATA[isbn] = data
    print()
    #break
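pandas can also parse an HTML table directly, which is sometimes easier than walking the rows by hand; here the last table parsed above is used as the example. A minimal sketch, assuming lxml is installed and the table parses cleanly:

from io import StringIO

# read_html() returns a list of DataFrames, one per table in the markup
if table is not None:
    tables = pd.read_html(StringIO(str(table)))
    display(tables[0])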
display(DATA)
df = pd.DataFrame(DATA).transpose()
df.columns = ['Title', 'ISBN13', 'PRICE']
display(df)

df.to_csv('data.csv')
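The ISBN keys become the DataFrame index, so to_csv writes them as the first, unnamed column. A quick round-trip check that reads the index back in:

# the first column holds the ISBN index
df_check = pd.read_csv('data.csv', index_col=0)
display(df_check.head())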