2일차(데이터 크롤링3)

Python/파이썬으로 배우는 머신러닝 기초 교육

2일차(데이터 크롤링3)

hyunjoo 2021. 7. 20. 18:05

<'네이버금융'사이트 시가총액 데이터 크롤링>

#네이버금융 홈페이지의 데이터크롤링이 필요한 이유

:현재 네이버금융의 코스피·코스닥 시가총액 페이지에서는 특정 키워드로 검색할 수 없음. 크롤링으로 데이터를 가져오면 키워드 검색이나 오름차순·내림차순 정렬이 가능해져 원하는 데이터에 더 쉽게 접근할 수 있음.

#코스피 메뉴의 첫페이지 데이터 가져오기

># Download page 1 of the sosok=0 market-cap listing and parse the response text.
>import requests
>from bs4 import  BeautifulSoup
>import pandas as pd 
>
>url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
>html=BeautifulSoup(url.text)
>
># Pick the first <table> element with class "type_2".
>table=html.select('table.type_2')[0]
># read_html is given the markup as a string; the result comes back inside a list.
>pd.read_html(str(table))

##설명

>table=html.select('table.type_2')[0] : class가 type_2인 table 태그 중 첫 번째 것을 table 변수에 저장함.

>pd.read_html(str(table)) : pandas 함수 중 read_html함수를 이용하여 html을 읽어옴.

>read_html에는 HTML을 문자열 형태로 전달해야 하므로 str(table)로 변환해서 넘겨줌. 읽어온 표는 리스트 안에 담긴 채로 반환됨.

#리스트에서 표 꺼내기

# Fetch and parse page 1, then index the read_html result to get the table itself.
url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
html=BeautifulSoup(url.text)

table=html.select('table.type_2')[0]
pd.read_html(str(table))[0]           #take the table out of the list

#'N'열 과 '토론실' 열 제거하기

# Fetch page 1, read the table, and remove the 'N' and '토론실' columns.
import requests
from bs4 import  BeautifulSoup
import pandas as pd #data analysis

url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
html=BeautifulSoup(url.text)

table=html.select('table.type_2')[0]
table=pd.read_html(str(table))[0] 

# Delete the two columns in place.
del table['N']
del table['토론실']

# Display the resulting table.
table

#NaN값 제거하기

# Fetch page 1, drop the 'N' and '토론실' columns, and show only rows that have a '종목명' value.
import requests
from bs4 import  BeautifulSoup
import pandas as pd #data analysis

url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
html=BeautifulSoup(url.text)

table=html.select('table.type_2')[0]
table=pd.read_html(str(table))[0] 

del table['N']
del table['토론실']
#the NaN rows are the separator lines..
table[table['종목명'].notnull()]

##'종목명'기준으로 null값이 아닌 값만 남김.

#마지막 페이지까지 모든 데이터 가져오기 위해 마지막 페이지 크롤링

##소스코드를 보고 맨뒤 페이지가 32페이지라는 것을 알 수 있음.

#마지막 페이지 번호 가져오기위해 먼저 a태그에 접근하기

># Fetch page 1 and select the first <a> inside the first <td class="pgRR"> cell.
>url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
>html=BeautifulSoup(url.text)
>
>html.select('td.pgRR')[0].select('a')[0]


<a href="/sise/sise_market_sum.nhn?sosok=0&amp;page=32">맨뒤
				<img alt="" border="0" height="5" src="https://ssl.pstatic.net/static/n/cmn/bu_pgarRR.gif" width="8"/>
</a>

#a태그의 속성 값에 접근하기

># Fetch page 1 and read the href attribute of the <a> inside td.pgRR.
>url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
>html=BeautifulSoup(url.text)
>
>html.select('td.pgRR')[0].select('a')[0]['href']#the <a> tag is already selected; now read its attribute value


/sise/sise_market_sum.nhn?sosok=0&page=32

#속성 값 중 숫자 데이터만 가져오기

>url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
>html=BeautifulSoup(url.text)
>
>html.select('td.pgRR')[0].select('a')[0]['href'][-2:]

32

#kospi_page라는 변수에 마지막 페이지 데이터를 정수형으로 저장하기

>url=requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
>html=BeautifulSoup(url.text)
>
>kospi_page=int(html.select('td.pgRR')[0].select('a')[0]['href'][-2:])

#전체 코드

from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # shows the progress of the for loop

# Read the last page number from the td.pgRR link on page 1 of the sosok=0 listing.
url = requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
# Name the parser explicitly so the result does not depend on which parsers are installed.
html = BeautifulSoup(url.text, "html.parser")
last_href = html.select('td.pgRR')[0].select('a')[0]['href']
# Take the text after "page=" rather than the last two characters, so the
# page count is parsed correctly whatever its number of digits.
kospi_page = int(last_href.rsplit('page=', 1)[-1])

# One cleaned DataFrame per page is collected here.
kospi_box = []

for n in tqdm(range(1, kospi_page + 1)):
    url = requests.get(f"https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page={n}")
    html = BeautifulSoup(url.text, "html.parser")

    table = html.select('table.type_2')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas. [0] takes the table out of the list.
    table = pd.read_html(StringIO(str(table)))[0]

    del table['N']
    del table['토론실']
    # The NaN rows are the separator lines; keep only rows with a '종목명'.
    table = table[table['종목명'].notnull()]
    kospi_box.append(table)

##tqdm 라이브러리 사용하면 진행상황 알 수 있음.

#kospi_box변수에 32페이지의 모든 표 저장되었는지 확인하기

>len(kospi_box) #32 tables (DataFrames) have been collected

32

#32개의 표 합치기

pd.concat(kospi_box) #merge the 32 tables into a single table

>>>이 표의 문제점!

각 페이지의 표를 이어 붙이는 과정에서 인덱스가 전체에 걸쳐 연속적으로 매겨지지 않고, 각 페이지 표가 가지고 있던 인덱스 번호가 그대로 유지됨.

1.
2.
3.
4.
.
.

이런식으로....

#인덱스 처음부터 세기

pd.concat(kospi_box,ignore_index=True) #renumber the index from the start

#전체 코드

from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Read the last page number from the td.pgRR link on page 1 of the sosok=0 listing.
url = requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
# Name the parser explicitly so the result does not depend on which parsers are installed.
html = BeautifulSoup(url.text, "html.parser")
last_href = html.select('td.pgRR')[0].select('a')[0]['href']
# Take the text after "page=" rather than the last two characters, so the
# page count is parsed correctly whatever its number of digits.
kospi_page = int(last_href.rsplit('page=', 1)[-1])

# One cleaned DataFrame per page is collected here.
kospi_box = []

for n in tqdm(range(1, kospi_page + 1)):
    url = requests.get(f"https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page={n}")
    html = BeautifulSoup(url.text, "html.parser")

    table = html.select('table.type_2')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas. [0] takes the table out of the list.
    table = pd.read_html(StringIO(str(table)))[0]

    del table['N']
    del table['토론실']

    # Keep only rows that have a '종목명' value.
    table = table[table['종목명'].notnull()]
    kospi_box.append(table)

# Merge every page into one table with a fresh 0..n-1 index, then save it.
kospi = pd.concat(kospi_box, ignore_index=True)
kospi.to_excel('kospi.xlsx')
kospi

#표와 엑셀

<코스닥 데이터 크롤링&코스피와 합치기>

#코스닥 크롤링

from io import StringIO  # imported here because this snippet newly depends on it

# Read the last page number from the td.pgRR link of the sosok=1 listing.
url = requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=1")
# Name the parser explicitly so the result does not depend on which parsers are installed.
html = BeautifulSoup(url.text, "html.parser")
last_href = html.select('td.pgRR')[0].select('a')[0]['href']
# Take the text after "page=" rather than the last two characters, so the
# page count is parsed correctly whatever its number of digits.
kosdaq_page = int(last_href.rsplit('page=', 1)[-1])

# One cleaned DataFrame per page is collected here.
kosdaq_box = []

for n in tqdm(range(1, kosdaq_page + 1)):
    url = requests.get(f"https://finance.naver.com/sise/sise_market_sum.nhn?sosok=1&page={n}")
    html = BeautifulSoup(url.text, "html.parser")

    table = html.select('table.type_2')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas. [0] takes the table out of the list.
    table = pd.read_html(StringIO(str(table)))[0]

    del table['N']
    del table['토론실']

    # Keep only rows that have a '종목명' value.
    table = table[table['종목명'].notnull()]
    kosdaq_box.append(table)

# Merge every page into one table with a fresh 0..n-1 index, then save it.
kosdaq = pd.concat(kosdaq_box, ignore_index=True)
kosdaq.to_excel('kosdaq.xlsx')
kosdaq

#코스피와 코스닥 표 출력,새로운 열(종류)만들기

from io import StringIO  # imported here because this snippet newly depends on it

# Read the last page number from the td.pgRR link on page 1 of the sosok=0 listing.
url = requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1")
# Name the parser explicitly so the result does not depend on which parsers are installed.
html = BeautifulSoup(url.text, "html.parser")
last_href = html.select('td.pgRR')[0].select('a')[0]['href']
# Take the text after "page=" rather than the last two characters, so the
# page count is parsed correctly whatever its number of digits.
kospi_page = int(last_href.rsplit('page=', 1)[-1])

# One cleaned DataFrame per page is collected here.
kospi_box = []

for n in tqdm(range(1, kospi_page + 1)):
    url = requests.get(f"https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page={n}")
    html = BeautifulSoup(url.text, "html.parser")

    table = html.select('table.type_2')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas. [0] takes the table out of the list.
    table = pd.read_html(StringIO(str(table)))[0]

    del table['N']
    del table['토론실']
    # The NaN rows are the separator lines. Copy the filtered rows so the new
    # column below is added to an independent frame, not to a slice of the
    # original (which triggers SettingWithCopyWarning).
    table = table[table['종목명'].notnull()].copy()
    # A scalar is broadcast to every row; no need to build a list of labels.
    table['종류'] = 'KOSPI'
    kospi_box.append(table)

# Merge every page into one table with a fresh 0..n-1 index, then save it.
kospi = pd.concat(kospi_box, ignore_index=True)
kospi.to_excel('kospi.xlsx')
kospi

# KOSDAQ
from io import StringIO  # imported here because this snippet newly depends on it

# Read the last page number from the td.pgRR link of the sosok=1 listing.
url = requests.get("https://finance.naver.com/sise/sise_market_sum.nhn?sosok=1")
# Name the parser explicitly so the result does not depend on which parsers are installed.
html = BeautifulSoup(url.text, "html.parser")
last_href = html.select('td.pgRR')[0].select('a')[0]['href']
# Take the text after "page=" rather than the last two characters, so the
# page count is parsed correctly whatever its number of digits.
kosdaq_page = int(last_href.rsplit('page=', 1)[-1])

# One cleaned DataFrame per page is collected here.
kosdaq_box = []

for n in tqdm(range(1, kosdaq_page + 1)):
    url = requests.get(f"https://finance.naver.com/sise/sise_market_sum.nhn?sosok=1&page={n}")
    html = BeautifulSoup(url.text, "html.parser")

    table = html.select('table.type_2')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas. [0] takes the table out of the list.
    table = pd.read_html(StringIO(str(table)))[0]

    del table['N']
    del table['토론실']
    # The NaN rows are the separator lines. Copy the filtered rows so the new
    # column below is added to an independent frame, not to a slice of the
    # original (which triggers SettingWithCopyWarning).
    table = table[table['종목명'].notnull()].copy()
    # A scalar is broadcast to every row; no need to build a list of labels.
    table['종류'] = 'KOSDAQ'
    kosdaq_box.append(table)

# Merge every page into one table with a fresh 0..n-1 index, then save it.
kosdaq = pd.concat(kosdaq_box, ignore_index=True)
kosdaq.to_excel('kosdaq.xlsx')
kosdaq

#두 표 합치기

# Stack the kospi and kosdaq tables into one table; ignore_index=True renumbers the rows.
stock=pd.concat([kospi,kosdaq],ignore_index=True)
# Display the combined table.
stock

'Python > 파이썬으로 배우는 머신러닝 기초 교육' 카테고리의 다른 글

3일차(데이터 분석 및 시각화) (0)	2021.07.25
3일차(데이터 가공) (0)	2021.07.23
2일차(데이터 크롤링2) (0)	2021.07.19
2일차(데이터 크롤링) (0)	2021.07.15
2일차(파이썬 기초) (0)	2021.07.15

현재글2일차(데이터 크롤링3)

LeeHyunjoo

bagging, 랭체인, 데이터통계, 크롤링, 배깅모델, CatBoost, 셀레니움, 분류성능지표, 백준, LangChain, 부스팅모델, ensemble model, 파이썬, 머신러닝, boosting, selenium, 세레니움 크롤링, PYTHON, LLM, f1score,

Today :
Yesterday :

일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

LeeHyunjoo