HTML Parse

1. 개 요

지정된 웹 페이지 내용에서 사용자가 원하는 부분을 Parsing하여 CSV 파일, JSON 파일, SQLite(DB) 파일로 저장한다. 또한 BeautifulSoup을 활용하여 웹 크롤링으로 사용자가 원하는 데이터를 추출하도록 코딩한다.

2. 내 용

① HTMLParse - CSV

HTML Parse로 데이터 추출 후 CSV 파일로 저장한다.

URL] http://www.hanbit.co.kr/store/books/full_book_list.html

import re
import urllib.request
from html import unescape
import csv

# Download the Hanbit full book list and save every (title, URL) pair as CSV.
url = "http://www.hanbit.co.kr/store/books/full_book_list.html"

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as req:
    # Use the charset declared in the Content-Type header; fall back to UTF-8.
    encoding = req.info().get_content_charset(failobj="utf-8")
    html = req.read().decode(encoding)

# An explicit encoding keeps the output independent of the platform locale,
# which otherwise may be unable to represent some of the scraped titles.
with open("hanbit book list.csv", 'w', newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "URL"])

    # Each book sits in a <td class="left"> cell that wraps a single anchor.
    for partial_html in re.findall(r'<td class="left"><a.*?</td>', html, re.DOTALL):
        href_match = re.search(r'<a href="(.*?)">', partial_html)
        if href_match is None:
            # Cell without a plain href anchor: skip it instead of crashing.
            continue
        book_url = "http://www.hanbit.co.kr" + href_match.group(1)

        # Drop the markup, then decode HTML entities left in the title text.
        title = unescape(re.sub(r'<.*?>', '', partial_html))

        writer.writerow([title, book_url])

② HTMLParse - JSON

HTML Parse로 데이터 추출 후 JSON 파일로 저장한다.

URL] http://www.hanbit.co.kr/store/books/full_book_list.html

import json, re
from urllib.request import urlopen
from html import unescape


# Download the Hanbit full book list and save it as a JSON array of
# {"BookName": ..., "Link": ...} objects.
with urlopen("http://www.hanbit.co.kr/store/books/full_book_list.html") as req:
    # Use the charset declared in the Content-Type header; fall back to UTF-8.
    encoding = req.info().get_content_charset(failobj="utf-8")
    html = req.read().decode(encoding)

# Collect every record before touching the output file, so a failure while
# scraping does not leave a truncated test.json behind.
data = []
for partial_html in re.findall(r'<td class="left"><a.*?</td>', html, re.DOTALL):
    href_match = re.search(r'<a href="(.*?)">', partial_html)
    if href_match is None:
        # Cell without a plain href anchor: skip it instead of crashing.
        continue
    url = 'http://www.hanbit.co.kr' + href_match.group(1)

    # Drop the markup, then decode HTML entities left in the title text.
    title = unescape(re.sub(r'<.*?>', '', partial_html))

    data.append({"BookName": title, "Link": url})

# Show the result once; printing inside the loop re-serialised the whole
# accumulated list on every iteration.
print(json.dumps(data, ensure_ascii=False, indent=2))

with open("test.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

③ HTMLParse - SQLite

HTML Parse로 데이터 추출 후 SQLite 파일로 저장한다.

URL] http://www.hanbit.co.kr/store/books/full_book_list.html

import re, sqlite3
from urllib.request import urlopen
from html import unescape

def main():
    """Fetch the Hanbit book list page, extract its books, and store them in books.db."""
    page = fetch("http://www.hanbit.co.kr/store/books/full_book_list.html")
    # scrape() turns the page into book records; save() persists them.
    save("books.db", scrape(page))


def fetch(url):
    """Download *url* and return the response body decoded to text.

    The charset is taken from the response's Content-Type header; UTF-8 is
    used when the header does not declare one.

    Args:
        url: Address passed to ``urlopen``.

    Returns:
        The decoded response body as ``str``.
    """
    # The context manager closes the response even if reading or decoding
    # raises, so the underlying connection is not leaked.
    with urlopen(url) as response:
        encoding = response.info().get_content_charset(failobj="utf-8")
        return response.read().decode(encoding)


def scrape(html):
    """Extract book titles and absolute URLs from the book-list HTML.

    Every ``<td class="left">`` cell that starts with an anchor is treated as
    one book. Cells whose anchor carries no ``href`` are skipped.

    Args:
        html: The page markup as a string.

    Returns:
        A list of ``{"url": ..., "title": ...}`` dicts in document order.
    """
    from urllib.parse import urljoin

    # Compile once, outside the loop.
    cell_pattern = re.compile(r'<td class="left"><a.*?</td>', re.DOTALL)
    # Accept other attributes before href (e.g. <a class="x" href="...">) and
    # stop at the closing quote so trailing attributes are not captured.
    href_pattern = re.compile(r'<a\s+(?:[^>]*?\s)?href="([^"]*)"')
    tag_pattern = re.compile(r'<.*?>')

    books = []
    for partial_html in cell_pattern.findall(html):
        href_match = href_pattern.search(partial_html)
        if href_match is None:
            # Anchor without an href: nothing to link to, so skip the cell
            # instead of failing on a missing match.
            continue
        # urljoin prefixes site-relative paths and leaves absolute links intact.
        url = urljoin("http://www.hanbit.co.kr", href_match.group(1))
        # Drop the markup, then decode HTML entities left in the title text.
        title = unescape(tag_pattern.sub('', partial_html))
        books.append({"url": url, "title": title})
    return books


def save(db_path, books):
    """Replace the ``books`` table in the SQLite file with the given records.

    Any existing ``books`` table is dropped and recreated with ``title`` and
    ``url`` text columns before the rows are inserted.

    Args:
        db_path: Path of the SQLite database file to open or create.
        books: Iterable of mappings that provide ``title`` and ``url`` keys.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Used as a context manager, the connection commits on success and
        # rolls back if any statement raises.
        with conn:
            conn.execute("DROP TABLE IF EXISTS books")
            conn.execute(""" CREATE TABLE books(title text,url text)""")
            conn.executemany("INSERT INTO books VALUES (:title, :url)", books)
    finally:
        # The transaction context does not close the connection; do it here so
        # the file handle is released on failure as well.
        conn.close()

# Run the fetch / scrape / save pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

④ BeautifulSoup WEB Crawling

BeautifulSoup을 활용하여 HTML Parse로 데이터 추출 후 엑셀 파일로 저장한다.

--> 네이버 헤드라인 뉴스 정보 추출

URL] https://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=105

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import openpyxl


# Scrape the Naver news headline clusters and save (title, link) rows to an
# Excel workbook.
url = "https://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=105"

wb = openpyxl.Workbook()
sheet = wb.active

# The context manager closes the HTTP response once the page has been parsed.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# find_all is the current name of the legacy findAll alias.
link = bs.find_all('div', {'class': 'cluster_text'})

for li in link:
    # Only anchors whose href starts with "https" are accepted.
    headline = li.find('a', href=re.compile('^(https).*$'))
    if headline is None:
        # No matching anchor in this cluster: skip it rather than crash or
        # reuse the link from the previous iteration.
        continue

    # The href filter above guarantees the attribute is present.
    headline_href = headline.attrs['href']

    # get_text() returns the anchor's text with HTML entities decoded, unlike
    # stripping tags from the serialised markup with a regex.
    title = headline.get_text()

    sheet.append([title, headline_href])

wb.save('D:/Projects/Project/news headline information.xlsx')

print("Success!!")