๐ Python
12. 웹 크롤링
sa1t
2022. 6. 9. 10:56
## Web crawling
# Scrapes the data you want from a page, starting from its URL.
# Examples covered here: Naver Webtoon and Naver weather information.
###### 1. Read Naver weather information as strings
import errno
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

# Search Naver for "weather" (the query parameter is the URL-encoded Korean
# word) and keep the HTTP response.
response = requests.get('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EB%82%A0%EC%94%A8')
# To inspect the raw markup while exploring: pprint(response.text)

# Parse the returned markup with the 'html.parser' backend.
page = bs(response.text, 'html.parser')

# <ul class="today_chart_list"> is the parent layer that groups the
# same-shaped readings (fine dust, ultrafine dust, ...).
chart = page.find('ul', {'class': 'today_chart_list'})

# Each reading is one <li> child of that list.
entries = chart.findAll('li')

# The first entry is the fine-dust reading. `.text` drops the tags and HTML
# syntax and keeps only the character data.
first_entry = entries[0]
fine_dust = first_entry.find('span', {'class': 'txt'}).text
print(fine_dust)
# Quiz) Extract the text for ultrafine dust, UV index and sunset
# Expected output shape: print("오늘의 초미세먼지 농도는 %s입니다 " %())
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

# Quiz answer: print the ultrafine-dust, UV-index and sunset readings from the
# Naver weather search result.

# Search Naver for "weather" and keep the HTTP response in `html`.
html = requests.get('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EB%82%A0%EC%94%A8')
# To inspect the raw markup while exploring: pprint(html.text)

# Parse the returned markup with the 'html.parser' backend.
soup = bs(html.text, 'html.parser')

# <ul class="today_chart_list"> is the parent layer that groups the
# same-shaped readings; dump it so the structure can be checked by eye.
data1 = soup.find('ul', {'class': 'today_chart_list'})
pprint(data1)

# Each reading (fine dust, ultrafine dust, ...) is one <li> of that list.
data2 = data1.findAll('li')
pprint(data2)

# Entries 1..3 follow the fine-dust entry at index 0.
fine_sodust = data2[1].find('span', {'class': 'txt'}).text
fine_sun = data2[2].find('span', {'class': 'txt'}).text
# NOTE: despite its name, `fine_sunrise` is printed as the *sunset* time.
fine_sunrise = data2[3].find('span', {'class': 'txt'}).text

# The messages were split across two lines and mis-encoded, which made them
# unterminated string literals; they are restored as single-line Korean text.
print("오늘의 초미세먼지 농도는 [%s] 입니다" % fine_sodust)
print("오늘의 자외선 지수는 [%s] 입니다" % fine_sun)
print("오늘의 일몰 시간은 [%s] 입니다" % fine_sunrise)
###### 2. Fetch Naver Webtoon titles
from bs4 import BeautifulSoup
from pprint import pprint
import requests

# Download the page to crawl.
html = requests.get("https://comic.naver.com/webtoon/weekday")
# Use the name this section actually imports; the original called `bs`, an
# alias that only existed because an earlier section had imported it.
soup = BeautifulSoup(html.text, 'html.parser')
# A response opened with get() should be closed once its body has been read.
html.close()
# To inspect the raw markup while exploring: pprint(html.text)

# find() returns only the first <div class="col_inner">, which the original
# notes describe as the Monday column.
data1 = soup.find('div', {'class': 'col_inner'})

# Within that column, the titles are the <a class="title"> links.
data2 = data1.findAll('a', {'class': 'title'})
pprint(data2)

# Keep only the title text of each link.
title_list = [i.text for i in data2]
pprint(title_list)
# Fetch the webtoon titles for every weekday
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

# Download the page to crawl, parse it, then close the response now that its
# body has been read.
html = requests.get("https://comic.naver.com/webtoon/weekday")
soup = bs(html.text, 'html.parser')
html.close()
pprint(html.text)

# One <div class="col_inner"> per weekday column; findAll returns them as a list.
day_columns = soup.findAll('div', {'class': 'col_inner'})

# Walk the columns in order and flatten the <a class="title"> texts of each
# column into a single list.
week_title_list = [
    link.text
    for column in day_columns
    for link in column.findAll('a', {'class': 'title'})
]
pprint(week_title_list)

### Alternative: collect every title link in one pass over the whole page
title_links = soup.findAll('a', {'class': 'title'})
week_title_list = [link.text for link in title_links]
## Quiz) Completed webtoons => sorted by popularity => build a list of ranks 1 to 50
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests

# Download the popularity-ordered list view of completed webtoons, parse it,
# then close the response now that its body has been read.
html = requests.get("https://comic.naver.com/webtoon/finish?order=User&view=list")
soup = bs(html.text, 'html.parser')
html.close()
# To inspect the raw markup while exploring: pprint(html.text)

# The ranking table(s) are the <div class="list_area table_list_area"> blocks.
data1_list = soup.findAll('div', {'class': 'list_area table_list_area'})

# Gather the title cell (<td class="subject">) of every row, in page order,
# stripping the surrounding whitespace from each title.
title_list = []
for data1 in data1_list:
    title_list.extend(
        cell.text.strip() for cell in data1.findAll('td', {'class': 'subject'})
    )

# Ranks 1..50 are the first 50 entries. The original slice [0:51] returned 51
# items, one more than the quiz asks for.
print(title_list[:50])
###### 3. Download Naver Webtoon thumbnails
from bs4 import BeautifulSoup
from pprint import pprint
import requests, re, os
from urllib.request import urlretrieve

# Create the output folder. exist_ok=True makes an already-existing folder a
# no-op, so no separate isdir() check or errno.EEXIST test is needed; any
# remaining OSError is a real failure.
try:
    os.makedirs('image', exist_ok=True)
except OSError:
    print("폴더 생성 실패!")
    # Stop with a non-zero status so the failure is visible to the caller.
    raise SystemExit(1)

# Download the weekday page, parse it, then close the response.
html = requests.get("http://comic.naver.com/webtoon/weekday.nhn")
soup = BeautifulSoup(html.text, 'html.parser')
html.close()

# One <div class="col_inner"> per weekday column.
data1_list = soup.findAll('div', {'class': 'col_inner'})

# Every <li> of every column holds one webtoon's title and thumbnail.
li_list = []
for data1 in data1_list:
    li_list.extend(data1.findAll('li'))

# Anything that is not a digit, an ASCII letter or a Hangul character is
# removed from the title so it can be used as a file name. The character
# class was mis-encoded in the original and is restored here.
unsafe_chars = re.compile('[^0-9a-zA-Zㄱ-힗]')

# Save each thumbnail as image/<sanitised title>.jpg.
for li in li_list:
    img = li.find('img')
    # Skip entries without a usable <img title=... src=...> instead of
    # aborting the whole download run on the first odd list item.
    if img is None or not img.get('title') or not img.get('src'):
        continue
    title = unsafe_chars.sub('', img['title'])
    img_src = img['src']
    # Arguments: source URL, then folder + file name + extension.
    urlretrieve(img_src, os.path.join('image', title + '.jpg'))
# Quiz) Naver Shopping > search for KF94 masks
# Download the thumbnails shown on the page into the image2 folder
from bs4 import BeautifulSoup
from pprint import pprint
import requests, re, os
from urllib.request import urlretrieve

# The quiz asks for the image2 folder; the original created and wrote to
# 'image', mixing these files with the webtoon thumbnails.
# exist_ok=True makes an already-existing folder a no-op, so no isdir() check
# or errno.EEXIST test is needed; any remaining OSError is a real failure.
try:
    os.makedirs('image2', exist_ok=True)
except OSError:
    print("폴더 생성 실패!")
    # Stop with a non-zero status so the failure is visible to the caller.
    raise SystemExit(1)

# Download the search-result page, parse it, then close the response.
html = requests.get("https://search.shopping.naver.com/search/all?query=kf94&cat_id=&frm=NVSHATC")
soup = BeautifulSoup(html.text, 'html.parser')
html.close()

# The result list(s) are the <ul class="list_basis"> elements.
data1_list = soup.findAll('ul', {'class': 'list_basis'})

# Each product thumbnail is wrapped in an <a class="thumbnail_thumb__3Agq6">.
div_list = []
for data1 in data1_list:
    div_list.extend(data1.findAll('a', {'class': 'thumbnail_thumb__3Agq6'}))

# Anything that is not a digit, an ASCII letter or a Hangul character is
# removed from the title so it can be used as a file name. The character
# class was mis-encoded in the original and is restored here.
unsafe_chars = re.compile('[^0-9a-zA-Zㄱ-힗]')

# Save each thumbnail as image2/<sanitised title>.jpg.
for div in div_list:
    img = div.find('img')
    # Skip entries without a usable <img title=... src=...> instead of
    # aborting the whole download run on the first odd result.
    if img is None or not img.get('title') or not img.get('src'):
        continue
    title = img['title']
    img_src = img['src']
    print(title, img_src)
    title = unsafe_chars.sub('', title)
    # Arguments: source URL, then folder + file name + extension.
    urlretrieve(img_src, os.path.join('image2', title + '.jpg'))