[파이썬] 분석데이터 수집 및 분석(전처리)

GOGO치삼 2020. 4. 23. 09:49

2020. 4. 23. 09:49

# -*- coding: utf-8 -*-
# Load the scraped Naver product table from SQLite and summarise its prices.
import sqlite3

import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame

# The connection is only needed for the single read below, so it is closed
# right after it, even if the query fails. Nothing is written, so there is
# nothing to commit.
con = sqlite3.connect("C:/pythonwork/naver_db")
try:
    readed_df = pd.read_sql("SELECT * FROM naver_table", con)
finally:
    con.close()
print(readed_df.describe())


# Exercise 1. Compute the mean / max / min / quartiles of the price column.
price = readed_df["가격"]

# The result names deliberately avoid shadowing the built-ins max() and min().
mean_price = price.mean()
print(mean_price)
max_price = price.max()
print(max_price)
min_price = price.min()
print(min_price)

# quantile() without arguments returns only the median (q=0.5), so the three
# quartile cut points are requested explicitly.
quartiles = price.quantile([0.25, 0.5, 0.75])
total = pd.DataFrame({"4분위값": quartiles})
print(total)


# Exercise 2. Plot the loaded data with Matplotlib.
plt.plot(readed_df["제품명"], readed_df["가격"])
plt.show()

# The original printed the undefined name `read_df` here.
print(readed_df)

# BeautifulSoup 사용하기

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, features='html.parser')

# prettify() renders the parsed document as an indented tree, which makes the
# tag hierarchy easy to read (much like formatted source code).
pretty_html = soup.prettify()
print(pretty_html)
'''
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
'''

# Each lookup is printed once. The bare expressions the original carried
# (e.g. a lone `soup.title`) only echo in an interactive session and do
# nothing in a script, so they are dropped.

# The <title> tag itself.
print(soup.title)
#<title>The Dormouse's story</title>

# Name of the title tag ('title').
print(soup.title.name)
# title

# Text contained in the title tag.
print(soup.title.string)
#The Dormouse's story

# Name of the title tag's parent tag.
print(soup.title.parent.name)
# head

# First <p> tag in the document.
print(soup.p)
#<p class="title"><b>The Dormouse's story</b></p>

# Value of the 'class' attribute of the first <p> tag.
print(soup.p['class'])
# ['title']

# First <a> tag in the document.
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


# Every <a> tag, as a list. The lookup is done once and reused by the href
# loop below; the original also called find_all() with no filter and threw
# the result away.
anchors = soup.find_all('a')
print(anchors)
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# soup.find() returns the tag matching the given filter;
# here, the tag whose id is "link3".
print(soup.find(id="link3"))
#<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


# .get('href') reads the href attribute of each link.
for link in anchors:
    print(link.get('href'))

#http://example.com/elsie
#http://example.com/lacie
#http://example.com/tillie


# get_text() returns the text contained in the HTML document.
print(soup.get_text())
'''
The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...

-----------------------------------------------------------------
#requests 기본 사용

html 소스 가져오기
requests를 사용하면 아래와 같이 간단한 코드만으로
웹페이지의 html 소스를 가져올 수 있다.
'''

import requests

# r.content gives the page body as bytes; r.text gives it decoded to str.
# A timeout keeps the script from waiting forever on an unresponsive server.
r = requests.get('https://google.com', timeout=10)
html = r.content
print(html)


# Response object: the return value of requests.get()
# It holds the server's answer to the HTTP request. The status_code, headers,
# encoding and ok attributes expose the details. The response fetched above
# is reused here instead of requesting the same page a second time.
print(r.status_code)
print(r.headers['Content-Type'])
print(r.encoding)
print(r.ok)

# status_code is 200 for a normal response and 404 when the page is not found.
# The original write-up reports ISO-8859-1 as the encoding of this response.
# r.ok is True when status_code is below 400 and False otherwise.

# If Korean text is displayed incorrectly because the detected encoding is
# wrong, override the encoding as below.
r.encoding ='UTF-8'

# requests only returns the HTML source as plain text/bytes, which is not
# convenient to parse. BeautifulSoup converts that source into objects that
# make it easy to analyse the HTML and extract data from it.

# Most-viewed Naver news
# With Python and BeautifulSoup a small web crawler is easy to write; this one
# fetches the headlines of Naver News' most-viewed ranking.

import requests
from bs4 import BeautifulSoup

# Example address:
# https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date=20190430
# Changing the `date` value in the address selects the ranking of that day.
url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date=20190430'

# A timeout keeps the script from hanging on an unresponsive server.
r = requests.get(url, timeout=10)
html = r.content
soup = BeautifulSoup(html,'html.parser')

# Select tags nested inside other tags: the selector walks down from the
# ranking section to the headline links.
titles_html = soup.select('.ranking_section > ol >li >dl >dt >a')

# Print the headlines in order, numbered from 1 (the original post reports
# 30 headlines for this page).
for rank, headline in enumerate(titles_html, start=1):
    print(rank, headline.text)
    
# 삼성전자 주식 일별 시세 가져오기
'''
네이버 증권에서 제공하는
삼성전자 종목(005930)의 일별 시세를 가져오기

주소:http://finance.naver.com/item/sise_day.nhn?code=005930
위의 주소와 같이 뒷부분에
code=005930와 같이 종목코드를 입력해주면
해당 종목의 일별시세를 볼 수 있다.
'''

#원하는 데이터 추출하기
'''
종목의 일별시세 페이지에서
날짜, 종가, 거래량만 추출해서 출력해보겠다.

개발자 도구 (컨트롤 + 쉬프트 +i 또는 F12)를 통해 소스를 보면
날짜, 종가, 거래량이 나온 부분을 찾을 수 있다.

'table', 'tr', 'td' 태그 안의 텍스트임을 알 수 있다.
'''

import requests
from bs4 import BeautifulSoup as bs

# Takes a stock code and the number of pages to fetch.
def print_stock_price(code,page_num):
    """Print date, closing price and volume from the daily-quote pages.

    Args:
        code: Stock code used in the ``code`` query parameter, e.g. '005930'.
        page_num: Number of pages to fetch, starting from page 1.

    Returns:
        None. One line per collected row is printed, in the order the rows
        appear on the fetched pages.
    """
    # Each entry is a (date, closing price, volume) tuple of cell texts.
    result = []

    # Adding &page=N to the address selects the N-th page of daily quotes.
    for page in range(1, page_num + 1):
        # requests builds the ?code=...&page=... query string from `params`.
        # The timeout keeps one stalled page from blocking the whole run.
        r = requests.get(
            "http://finance.naver.com/item/sise_day.nhn",
            params={'code': code, 'page': page},
            timeout=10,
        )
        soup = bs(r.content, "html.parser")
        # NOTE(review): if no rows come back, the site may be rejecting the
        # default client headers - confirm before relying on the output.

        # All tr tags that are direct children of a table.
        rows = soup.select('table>tr')

        # The first tr holds the th header cells and the last tr holds the
        # page numbers, so both are left out.
        for row in rows[1:-1]:
            cells = row.select('td')
            # The volume is read from the 7th cell; shorter rows are skipped
            # instead of raising IndexError.
            if len(cells) < 7:
                continue
            date = cells[0].text.strip()
            # Some rows carry no text at all.
            if not date:
                continue
            # 1st cell: date, 2nd cell: closing price, 7th cell: volume.
            result.append((date, cells[1].text.strip(), cells[6].text.strip()))

    for date, close, volume in result:
        print(date, close, volume)
#------------------------print_stock_price end
            
# Stock code to look up and the number of pages (50) to fetch.
stock_code = '005930'
pages = 50

# Prints date, closing price and volume, most recent first.
print_stock_price(stock_code,pages)

'''
Matplotlib 기본 사용

Matplotlib 라이브러리를 이용하여 그래프를 그리는 일반적인 방법

#pyplot 소개

matplotlib.pyplot은
Matplotlib을 MATLAB과 비슷하게 동작하도록 하는 명령어 스타일 함수의 모음
각각의 pyplot 함수를 사용해서 그림(figure)에 변화를 줄 수 있다.
예를 들어
그림을 만들어서 플롯 영역을 만들고
몇개의 라인을 플롯하고
(label)라벨들로 꾸미는 등의 일을 할 수 있다.
'''

# Basic graph
# Visualising values with pyplot is simple: passing a single list to
# pyplot.plot() draws a graph. Matplotlib treats the list as the y values and
# generates the x values ([0, 1, 2, 3]) automatically.
import matplotlib.pyplot as plt

plt.plot([1,2,3,4])
plt.ylabel('y-label')
plt.show()

# plot() is a versatile command that accepts an arbitrary number of
# arguments. Passing two lists plots y against x.
plt.plot([1,2,3,4],[1,4,9,16])
# Show this figure now; without it the next example would be drawn on top
# of this line.
plt.show()

# Styling
# For each x, y pair an optional third argument is a format string that sets
# the colour and line type. The default format string is 'b-' (solid blue
# line); 'ro' below means red circle markers.
plt.plot([1,2,3,4],[1,4,9,16],'ro')

# axis() sets the axis ranges as [xmin, xmax, ymin, ymax].
plt.axis([0,6,0,20])
plt.show()


# Drawing several graphs
# Working with plain lists only is limiting, so NumPy arrays are generally
# used with Matplotlib. In fact, all sequences are converted to NumPy arrays
# internally.
# Several lines with different format styles are drawn on one graph.

import matplotlib.pyplot as plt
import numpy as np

# Evenly sampled time at 200 ms intervals.
t = np.arange(0., 5., 0.2)

# Red dashes, blue squares and green triangles, one plot() call per series.
plt.plot(t, t, 'r--')
plt.plot(t, t**2, 'bs')
plt.plot(t, t**3, 'g^')
plt.show()

import matplotlib.pyplot as plt


def _plot_labelled(**line_style):
    """Plot y = x^2 for x = 1..4 with axis labels, fixed axis ranges, and show it."""
    plt.plot([1, 2, 3, 4], [1, 4, 9, 16], **line_style)
    # xlabel() / ylabel() put the given text on the x and y axes.
    plt.xlabel('X-Label')
    plt.ylabel('Y-Label')
    # axis() takes the ranges as [xmin, xmax, ymin, ymax]; the list must hold
    # exactly four values. Without this call the ranges are chosen
    # automatically to fit the data.
    plt.axis([0, 5, 0, 20])
    plt.show()


# Matplotlib: setting labels
# xlabel() and ylabel() set the labels of the x and y axes.
_plot_labelled()

# Matplotlib: choosing a colour
# Besides the common colours many named colours are available; passing
# color='springgreen' to plot() applies it.
_plot_labelled(color='springgreen')

# Matplotlib: choosing a colour (2)
# Colours can also be given as hex codes, together with the marker and the
# line style: colour '#e35f62', circle markers, dashed line.
_plot_labelled(color='#e35f62', marker='o', linestyle='--')
# Matplotlib: drawing several curves
# Three curves are drawn on one graph.
import matplotlib.pyplot as plt
import numpy as np

# np.arange() creates an evenly spaced array for the given start/stop/step.
a=np.arange(5)#range [0 1 2 3 4]
b=np.arange(1,5) # [1 2 3 4]
c=np.arange(2,10,2)# [2 4 6 8]

print(a)
print(b)
print(c)

# a becomes [0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8]
a = np.arange(0,2,0.2)

# Passing x, y and a style three times in a row draws three curves
# (y=x, y=x^2, y=x^3) at once.
# 'r--': red dashed line
# 'bo' : blue circle markers
# 'g-' : green solid line
plt.plot(a,a,'r--',
         a,a**2,'bo',
         a,a**3,'g-')
# Show this figure before the next example so the two do not overlap.
plt.show()

# The style of each of the three curves can also be set in detail.
a = np.arange(0, 2, 0.2)

# First curve: 'bo' style.
plt.plot(a, a, 'bo')

# Second curve: color='#e35f62', marker='*', linewidth=2.
plt.plot(a, a**2, color='#e35f62', marker='*', linewidth = 2)

# Third curve: color='springgreen', marker='^', markersize=9.
# The keyword is `marker`; the original spelled it `market`, which plot()
# rejects as an unknown property.
plt.plot(a, a**3, color='springgreen', marker='^', markersize=9)
plt.show()


# Matplotlib: grid and tick settings
# grid() and tick_params() set the style of the graph's grid and ticks.
import matplotlib.pyplot as plt
import numpy as np

a = np.arange(0, 2, 0.2)

# First curve: 'bo' style.
plt.plot(a, a, 'bo')

# Second curve: color='#e35f62', marker='*', linewidth=2.
plt.plot(a, a**2, color='#e35f62', marker='*', linewidth = 2)

# Third curve: color='springgreen', marker='^', markersize=9.
# The keyword is `marker`; the original spelled it `market`, which plot()
# rejects as an unknown property.
plt.plot(a, a**3, color='springgreen', marker='^', markersize=9)

# To display the grid, pass True as the first argument of grid().
# axis='y' shows only the grid lines of the y axis.
# alpha sets the transparency: 0 is transparent, 1 is opaque.
# linestyle='--' draws the grid dashed.
plt.grid(True, axis='y', color='gray', alpha=0.5, linestyle='--')

# tick_params() configures the ticks of the graph.
# axis='both' applies the settings to the ticks of both axes.
# direction='in' points the ticks towards the inside of the graph.
# length=3 is the tick length, pad=6 the distance between tick and label,
# labelsize=14 the size of the tick labels.
plt.tick_params(axis='both', direction='in', length=3, pad=6, labelsize=14)

plt.show()

# Matplotlib: setting the title
# title() sets the title of the graph.
graph_title = 'Sample graph'

# 1 - plt.title() sets the graph title to 'Sample graph'.
plt.title(graph_title)
plt.show()

# 2 - Position and offset
# loc='right' places the title at the top right of the graph. Allowed values
# are 'left', 'center' and 'right'; the default is 'center'.
# pad=20 sets the gap (offset) between title and graph, in points.
plt.title(graph_title, loc='right', pad=20)

# 3 - Font settings
# fontdict takes the font settings as a dictionary; here 'fontsize' is 16 and
# 'fontweight' is 'bold'.
# 'fontsize' accepts a number in points or a relative size such as 'smaller'
# or 'x-large'.
# 'fontweight' accepts 'normal', 'bold', 'heavy', 'light', 'ultrabold' and
# 'ultralight'.
title_font = dict(fontsize=16, fontweight='bold')
plt.title(graph_title, fontdict=title_font, loc='left', pad=20)

# Matplotlib: drawing bar graphs
# bar() draws a bar graph; here, data with one value per year is plotted.
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(3)

# years are the labels shown on the x axis, values are the y values.
years = ['2017', '2018', '2019']
values = [100, 400, 900]

# Pass x (=[0, 1, 2]) and the values (=[100, 400, 900]) to bar().
# The original repeated this assignment and call, drawing every bar twice.
plt.bar(x, values)

# xticks(x, years) labels the x axis with '2017', '2018', '2019' in order.
plt.xticks(x,years)
plt.show()

# Bar graphs also accept styling such as bar colour, edge colour and edge
# width. bar() is called with the x and y (=values) data first.
plt.bar(x, values,
        width=0.6,
        align='edge',
        color='springgreen',
        edgecolor='gray',
        linewidth=3,
        tick_label=years,
        log=True)
# Show the styled bar chart on its own; otherwise the horizontal bars below
# would be drawn onto the same log-scaled axes.
plt.show()

# barh() draws a horizontal bar graph.
y = np.arange(3)
plt.barh(y,values,
         height=-0.6,
         color='springgreen',
         edgecolor='gray',
         linewidth=3,
         tick_label=years,
         log=False)
plt.show()

# https://matplotlib.org/contents
# 여기서 필요한 모양의 그래프들과 소스를 확인 및 다운할 수 있다.


# NumPy random number generation (random module)
# numpy.random is the NumPy module used to generate random numbers.

# 1 - random.rand(): random numbers of a given shape.
import numpy as np

# Example 1
# The shape of the generated array is determined by the given value, and the
# values are uniformly distributed over [0, 1).
sample_count = 5
a = np.random.rand(sample_count)
print(a)
# Result of one run: [0.75004981 0.21139253 0.45048688 0.94381202 0.00407233]
'''
random.rand() 주어진 형태의 난수 array를 생성

random.randint() [최저값, 최대값) 범위에서 임의의 정수 (최대값은 포함되지 않는다)

random.randn() 표준정규분포(standard normal distribution)를 갖는 난수를 반환

random.standard_normal() : randn()과 standard_normal() 은 기능이 비슷하지만, 
                        standard_normal()은 튜플을 인자로 받는다는 점에서 차이가 있다.
                        
random.random_sample() : [0.0, 1.0) 범위의 임의의 실수를 반환
                          
random.choice() : 주어진 1차원 어레이에서 임의의 샘플을 생성

random.seed() : 난수 생성에 필요한 시드를 정한다. 같은 시드를 쓰면 코드를 실행할 때마다 똑같은 난수가 생성된다.
'''

# Matplotlib: drawing a scatter plot
# scatter() draws a scatter plot.

import matplotlib.pyplot as plt
import numpy as np

# Setting the seed with np.random.seed() makes the same random numbers
# reusable. The seed must be an integer between 0 and 4294967295.

# 1 - random.rand(): random numbers of a given shape.
# The shape of the array is determined by the given values, and the numbers
# are uniformly distributed over [0, 1).
a = np.random.rand(5)
print(a) # Result of one run: [0.44859207 0.21580016 0.1010523  0.0087913  0.7022354 ]

b = np.random.rand(2, 3)
print(b)
# Result of one run:
#     [[0.70633485 0.24791576 0.15788335]
#      [0.69769852 0.71995667 0.25774443]]

# Positions (x, y), marker colours and marker areas are all random.
# x, for example, holds 50 random values between 0 and 1.
N = 50
x, y, colors = (np.random.rand(N) for _ in range(3))
area = np.square(30 * np.random.rand(N))

# scatter() takes the x, y positions;
# s is the marker area, c the marker colour, alpha the marker transparency.
scatter_style = dict(s=area, c=colors, alpha=0.5)
plt.scatter(x, y, **scatter_style)
plt.show()

# Matplotlib: drawing histograms
# hist() draws a histogram.
#
# 1 - Passing the values directly
import matplotlib.pyplot as plt
import numpy as np

# weight holds body-weight values.
weight = [68,81,64,56,78,74,61,77,66,68,59,
          71,80,59,67,81,69,73,69,74,70,65]
# hist() accepts the values directly as a list.
plt.hist(weight)
# Show this histogram on its own before the next example is drawn.
plt.show()

# 2 - Drawing several histograms
# (This heading was bare text in the original, which is a syntax error.)
# np.random.randn(), np.random.standard_normal() and np.random.rand() are
# used to generate random values.

# a follows a normal distribution with standard deviation 2.0 and mean 1.0.
a = 2.0 * np.random.randn(10000) + 1.0

# b follows the standard normal distribution.
b = np.random.standard_normal(10000)

# c holds 5000 values uniformly distributed between -10.0 and 10.0.
c = 20.0*np.random.rand(5000) - 10.0

# The three distributions are drawn on one graph with plt.hist().
# bins sets how many intervals the range is split into.
# density=True turns the histogram into a density so the area under the bars
# is 1.
# alpha is the transparency, between 0.0 and 1.0.
# histtype='step' leaves the bars hollow; 'stepfilled' fills them.
# The original passed `a` to all three calls, so b and c were never drawn.
plt.hist(a, bins=100, density=True, alpha=0.7, histtype='step')
plt.hist(b, bins=50, density=True, alpha=0.5, histtype='stepfilled')
plt.hist(c, bins=100, density=True, alpha=0.9, histtype='step')

plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Uniform [0, 1) samples with three different sample sizes.
a = np.random.rand(1000)
b = np.random.rand(10000)
c = np.random.rand(100000)

plt.hist(a, bins=100, density=True, alpha=0.5, histtype='step', label='n=1000')
plt.hist(b, bins=100, density=True, alpha=0.75, histtype='step', label='n=10000')
plt.hist(c, bins=100, density=True, alpha=1.0, histtype='step', label='n=100000')
plt.legend()
# Show this figure here; otherwise the integer histograms below are drawn
# onto the same axes.
plt.show()

#---------- integer samples
# a: 1000 random integers in [0, 10)
a = np.random.randint(0, 10, 1000)
# b: 1000 random integers in [10, 20)
b = np.random.randint(10, 20, 1000)
# c: 1000 random integers in [0, 20)
c = np.random.randint(0, 20, 1000)

plt.hist(a, bins=100, density=False, alpha=0.5, histtype='step', label='0<=randint<10')
plt.hist(b, bins=100, density=False, alpha=0.75, histtype='step', label='10<=randint<20')
plt.hist(c, bins=100, density=False, alpha=1.0, histtype='step', label='0<=randint<20')
plt.legend()
plt.show()

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 15:18:46 2020

@author: admin
"""


# Matplotlib 3차원 산점도 그리기
'''
scatter() 를 이용해서 3차원 산점도(3D Scatter plot)를 그리기.

3차원 그래프를 그리기 위해서
from mpl_toolkits.mplot3d import Axes3D 를 추가.

이 부분은 matplotlib 3.1.0 버전부터는
디폴트로 포함되기 때문에 적어주지 않아도 된다.
'''

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# The x, y, z positions and the colour value of every marker are random.
n = 100
xmin, xmax, ymin, ymax, zmin, zmax = 0, 20, 0, 20, 0, 50
cmin, cmax = 0, 2

# Each array is scaled by the width of its own range. The original scaled
# all four by (xmax - xmin), so zs never went above 20 and the colour values
# ran up to 20 instead of staying below cmax.
xs = (xmax - xmin) * np.random.rand(n) + xmin
ys = (ymax - ymin) * np.random.rand(n) + ymin
zs = (zmax - zmin) * np.random.rand(n) + zmin
color = (cmax - cmin) * np.random.rand(n) + cmin


# rcParams sets the default figure size: (width, height) = (6, 6) inches.
plt.rcParams["figure.figsize"] = (6,6)
fig = plt.figure()

# To create 3D axes, pass the keyword projection='3d' to add_subplot().
ax = fig.add_subplot(111, projection='3d')

# scatter() takes the x, y, z positions as arrays.
# marker='o' draws circular markers.
# cmap='Greens' selects a green colormap for the colour values.
ax.scatter(xs, ys, zs, c=color, marker='o', s=15, cmap='Greens')

# Display the figure, as the other examples in this file do.
plt.show()

### Googletrans - a Google Translate API for Python ###
# Googletrans is a Python library that implements the Google Translate API.
# The original post describes it as free and unlimited.
# NOTE(review): confirm any usage limits against the library's documentation.

# Installing Googletrans (run one of these from a command prompt):
#   pip install googletrans
#   conda install -c conda-forge googletrans

### Googletrans basic usage ###
# The library translates a short sentence into a target language and detects
# the source language automatically.
# (The original repeated this whole section twice; it is kept once.)

# 1. Translating
# Import Translator from googletrans.
from googletrans import Translator

translator = Translator()

# translate() takes the sentence to translate. The result is fetched once
# and reused, instead of sending the same request for every print.
translated = translator.translate('안녕하세요')
print(translated)
# Result: Translated(src=ko, dest=en, text=Hi, pronunciation=None, extra_data="{'translat...")
# The sentence is translated into the default target language, English
# (dest=en), and the translation is Hi (text=Hi).
print(translated.text)
# Only the text attribute of the result is printed: 'Hi'.
# 이 함수가 가지고 있는 속성인 text 만 가지고 나와서 'Hi'를 출력한다.

'''
LANGUAGES = {    'af': 'afrikaans',    'sq': 'albanian',    'am': 'amharic',
             'ar': 'arabic',    'hy': 'armenian',    'az': 'azerbaijani',
             'eu': 'basque',    'be': 'belarusian',    'bn': 'bengali',
             'bs': 'bosnian',    'bg': 'bulgarian',    'ca': 'catalan',
             'ceb': 'cebuano',    'ny': 'chichewa',    'zh-cn': 'chinese (simplified)',
             'zh-tw': 'chinese (traditional)',    'co': 'corsican',    'hr': 'croatian',
             'cs': 'czech',    'da': 'danish',    'nl': 'dutch',    'en': 'english',
             'eo': 'esperanto',    'et': 'estonian',    'tl': 'filipino',
             'fi': 'finnish',    'fr': 'french',    'fy': 'frisian',    'gl': 'galician',
             'ka': 'georgian',    'de': 'german',    'el': 'greek',    'gu': 'gujarati',
             'ht': 'haitian creole',    'ha': 'hausa',    'haw': 'hawaiian',    'iw': 'hebrew',
             'hi': 'hindi',    'hmn': 'hmong',    'hu': 'hungarian',    'is': 'icelandic',
             'ig': 'igbo',    'id': 'indonesian',    'ga': 'irish',    'it': 'italian',
             'ja': 'japanese',    'jw': 'javanese',    'kn': 'kannada',    'kk': 'kazakh',
             'km': 'khmer',    'ko': 'korean',    'ku': 'kurdish (kurmanji)',    'ky': 'kyrgyz',
             'lo': 'lao',    'la': 'latin',    'lv': 'latvian',    'lt': 'lithuanian',
             'lb': 'luxembourgish',    'mk': 'macedonian',    'mg': 'malagasy',    'ms': 'malay',
             'ml': 'malayalam',    'mt': 'maltese',    'mi': 'maori',    'mr': 'marathi',
             'mn': 'mongolian',    'my': 'myanmar (burmese)',    'ne': 'nepali',    'no': 'norwegian',
             'ps': 'pashto',    'fa': 'persian',    'pl': 'polish',    'pt': 'portuguese',
             'pa': 'punjabi',    'ro': 'romanian',    'ru': 'russian',    'sm': 'samoan',
             'gd': 'scots gaelic',    'sr': 'serbian',    'st': 'sesotho',    'sn': 'shona',
             'sd': 'sindhi',    'si': 'sinhala',    'sk': 'slovak',    'sl': 'slovenian', 
             'so': 'somali',    'es': 'spanish',    'su': 'sundanese',    'sw': 'swahili',
             'sv': 'swedish',    'tg': 'tajik',    'ta': 'tamil',    'te': 'telugu',    'th': 'thai',
             'tr': 'turkish',    'uk': 'ukrainian',    'ur': 'urdu',    'uz': 'uzbek',
             'vi': 'vietnamese',    'cy': 'welsh',    'xh': 'xhosa',    'yi': 'yiddish',
             'yo': 'yoruba',    'zu': 'zulu',    'fil': 'Filipino',    'he': 'Hebrew'}
'''

저작자표시 비영리 변경금지

':: IT > python' 카테고리의 다른 글

[파이썬] 전처리 20200423 (0)	2020.04.27
[파이썬]수집된 데이터 형식 확인 및 로컬 전처리 (0)	2020.04.23
[python] Pandas 를 사용한 데이터 분석 기초, 크롤링 (0)	2020.04.21
[파이썬] 실전 프로젝트 (0)	2020.04.10
[파이썬] 함수와 모듈, 내장함수,클래스 (0)	2020.04.10

:: GO치의 에브리데이 일기장::

[파이썬] 분석데이터 수집 및 분석(전처리)

':: IT > python' 카테고리의 다른 글

+ Recent posts

티스토리툴바