๐Ÿ“„ Python

12. ์›น ํฌ๋กค๋ง

sa1t 2022. 6. 9. 10:56
๋ฐ˜์‘ํ˜•
## ์›น ํฌ๋กค๋ง
# URL ๊ธฐ์ค€์œผ๋กœ ์›ํ•˜๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ๊ธ์–ด์˜ค๋Š”๋ฐ ์‚ฌ์šฉ
# ๋„ค์ด๋ฒ„ ์›นํˆฐ , ๋‚ ์”จ ์ •๋ณด

###### 1. ๋„ค์ด๋ฒ„ ๋‚ ์”จ ์ •๋ณด๋ฅผ ๋ฌธ์ž์—ด๋กœ ๋ฐ›์•„์˜ค๊ธฐ
import errno

from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

# 1. Fetch Naver's weather search results and print today's fine-dust reading.
html = requests.get('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EB%82%A0%EC%94%A8')
#pprint(html.text)
# Search "weather" on Naver => fetch the HTML document => store it in `html`.

soup = bs(html.text, 'html.parser')
html.close()  # release the connection once the body has been parsed
# html.parser: stdlib parser used to extract data from the markup

data1 = soup.find('ul', {'class': 'today_chart_list'})
#pprint(data1)
# Parse the container that groups the identically-formatted entries
# (fine dust, ultra-fine dust, ...) under one <ul> tag.

data2 = data1.find_all('li')  # find_all is the modern name for findAll
#pprint(data2)
# The four <li> entries: fine dust, ultra-fine dust, UV index, sunset.

fine_dust = data2[0].find('span', {'class': 'txt'}).text
print(fine_dust)
# First entry holds the fine-dust reading.
# .text strips tags/markup and keeps only the human-readable string.

# Quiz) extract the text for ultra-fine dust, UV index, and sunset time.
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

html = requests.get('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EB%82%A0%EC%94%A8')
# Search "weather" on Naver => fetch the HTML document => store it in `html`.

soup = bs(html.text, 'html.parser')
html.close()  # release the connection once the body has been parsed

data1 = soup.find('ul', {'class': 'today_chart_list'})
#pprint(data1)  # debug only — keep commented out, it dumps the whole container
# Container of the identically formatted weather entries.

data2 = data1.find_all('li')
#pprint(data2)  # debug only
# Entries in page order: fine dust, ultra-fine dust, UV index, sunset.

fine_sodust = data2[1].find('span', {'class': 'txt'}).text
fine_sun = data2[2].find('span', {'class': 'txt'}).text
fine_sunrise = data2[3].find('span', {'class': 'txt'}).text
print("오늘의 초미세먼지 농도는 [%s] 입니다" %fine_sodust)
print("오늘의 자외선 지수는 [%s] 입니다" %fine_sun)
print("오늘의 일몰 시간은 [%s] 입니다" %fine_sunrise)

###### 2. Fetch Naver webtoon titles (first weekday column).
from bs4 import BeautifulSoup as bs  # alias so the `bs(...)` call below resolves on its own
from pprint import pprint
import requests

html = requests.get("https://comic.naver.com/webtoon/weekday")  # page to crawl
soup = bs(html.text, 'html.parser')
html.close()    # close the response after requests.get
#pprint(html.text)

# Extract the first weekday's webtoon column.
data1 = soup.find('div', {'class': 'col_inner'})
#pprint(data1)

# Extract the anchors that carry the titles.
data2 = data1.find_all('a', {'class': 'title'})    # <a class="title"> holds each title
pprint(data2)

# Keep only the title text of each anchor
# (comprehension replaces the redundant empty-list + append loop).
title_list = [i.text for i in data2]

pprint(title_list)



# ๋ชจ๋“  ์š”์ผ ์›นํˆฐ ์ œ๋ชฉ ๊ฐ€์ ธ์˜ค๊ธฐ
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

html = requests.get("https://comic.naver.com/webtoon/weekday")  # ํฌ๋กค๋งํ•  url ๋ฐ›์•„์˜ค๊ธฐ
soup = bs(html.text, 'html.parser')
html.close()    # get์œผ๋กœ ์—ฐ ํ›„์—๋Š” close๋กœ ๋‹ซ์•„์•ผํ•จ
pprint(html.text)

# ๋ชจ๋“  ์š”์ผ ์›นํˆฐ ์˜์—ญ ์ถ”์ถœํ•˜๊ธฐ => ๋ฆฌ์ŠคํŠธ
data1_list=soup.findAll('div',{'class':'col_inner'})
#pprint(data1_list)

week_title_list = []
for data1 in data1_list:
    # ์ œ๋ชฉ ์˜์—ญ ํฌํ•จํ•ด์„œ ์ถ”์ถœํ•˜๊ธฐ
    data2 = data1.findAll('a',{'class':'title'})
    #pprint(data2)

    title_list=[i.text for i in data2]
    #pprint(title_list)
    # ์š”์ผ๋ณ„ ๋ฆฌ์ŠคํŠธ ๋งŒ๋“ค๊ธฐ

    week_title_list.extend(title_list)
    # ์š”์ผ๋ณ„ ๋ฆฌ์ŠคํŠธ ํ•˜๋‚˜๋กœ ํ•ฉ์น˜๊ธฐ

pprint(week_title_list)

### ์• ์ดˆ๋ถ€ํ„ฐ ํ•˜๋‚˜๋กœ ํ•ฉ์น˜๊ธฐ
data1 = soup.findAll('a',{'class':'title'})
week_title_list = [i.text for i in data1]

## Quiz) completed webtoons => ordered by popularity => build a list of ranks 1-50.

from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

html = requests.get("https://comic.naver.com/webtoon/finish?order=User&view=list")  # page to crawl
soup = bs(html.text, 'html.parser')
html.close()    # close the response after requests.get
#pprint(html.text)

data1_list = soup.find_all('div', {'class': 'list_area table_list_area'})
#pprint(data1_list)

title_list = []
for data1 in data1_list:
    # Collect titles from EVERY list area. The original assigned `data2`
    # inside the loop but read it afterwards, so only the LAST area's
    # matches were ever kept.
    for cell in data1.find_all('td', {'class': 'subject'}):
        title_list.append(cell.text)

# Ranks 1-50 => exactly the first 50 entries.
# The original slice [0:51] was an off-by-one that produced 51 items.
print([t.strip() for t in title_list[:50]])


###### 3. Download every Naver webtoon thumbnail image.
from bs4 import BeautifulSoup
from pprint import pprint
import requests, re, os
from urllib.request import urlretrieve  # added for the image download

# Create the download folder. exist_ok=True makes the manual
# errno.EEXIST check of the original unnecessary.
try:
    os.makedirs('image', exist_ok=True)
except OSError:
    print("폴더 생성 실패!")
    raise SystemExit(1)  # exit() is intended for the REPL; SystemExit always works

# Open the page and read its source.
html = requests.get("http://comic.naver.com/webtoon/weekday.nhn")
soup = BeautifulSoup(html.text, 'html.parser')
html.close()

# Extract each weekday's webtoon column.
data1_list = soup.findAll('div', {'class': 'col_inner'})
# pprint(data1_list)

# Flat list of every webtoon <li> item (title + thumbnail).
li_list = []
for data1 in data1_list:
    li_list.extend(data1.findAll('li'))  # merge this column's items into li_list
# pprint(li_list)

# Pull each thumbnail URL and title, then save the image.
for li in li_list:
    img = li.find('img')
    title = img['title']
    img_src = img['src']
    # print(title,img_src)
    # Strip everything that is not alphanumeric or Hangul so the title
    # is safe to use as a file name.
    title = re.sub('[^0-9a-zA-Zㄱ-힗]', '', title)
    urlretrieve(img_src, './image/'+title+'.jpg')  # url, path + file name + extension


# Quiz) Naver Shopping > search for KF94 masks.
# Download the thumbnails shown on the page into the image2 folder.
from bs4 import BeautifulSoup
from pprint import pprint
import requests, re, os
from urllib.request import urlretrieve  # added for the image download

# Create the download folder. The quiz asks for 'image2'; the original
# mistakenly reused 'image'. exist_ok=True replaces the errno check.
try:
    os.makedirs('image2', exist_ok=True)
except OSError:
    print("폴더 생성 실패!")
    raise SystemExit(1)  # exit() is intended for the REPL; SystemExit always works

# Open the page and read its source.
html = requests.get("https://search.shopping.naver.com/search/all?query=kf94&cat_id=&frm=NVSHATC")
soup = BeautifulSoup(html.text, 'html.parser')
html.close()

data1_list = soup.findAll('ul', {'class': 'list_basis'})
#pprint(data1_list)

# Collect the thumbnail anchors from every result list.
div_list = []
for data1 in data1_list:
    div_list.extend(data1.findAll('a', {'class': 'thumbnail_thumb__3Agq6'}))
#pprint(div_list)

for div in div_list:
    img = div.find('img')
    title = img['title']
    img_src = img['src']
    print(title, img_src)
    # Keep only alphanumerics and Hangul so the title is filename-safe.
    title = re.sub('[^0-9a-zA-Zㄱ-힗]', '', title)
    urlretrieve(img_src, './image2/'+title+'.jpg')  # url, path + file name + extension
๋ฐ˜์‘ํ˜•