[파이썬] 데이터 전처리 20200424

GOGO치삼 2020. 4. 27. 10:35

2020. 4. 27. 10:35


# 워드 클라우드
# WordCloud 설치 : pip install wordcloud
from wordcloud import WordCloud,STOPWORDS

import numpy as np
from PIL import Image

# Load the "A New Hope" script text and the stormtrooper mask image.
# The mask keeps the name `alice_mask` because later statements refer to it.
# A `with` block is used so the text file handle is closed after reading.
with open('./data/09. a_new_hope.txt') as script_file:
    text = script_file.read()
alice_mask = np.array(Image.open('./data/09. stormtrooper_mask.png'))

# Start from wordcloud's built-in stopword set and additionally drop "said".
stopwords = set(STOPWORDS)
stopwords.add("said")

import matplotlib.pyplot as plt
import platform


path = "c:/Windows/Fonts/malgun.ttf"


# 폰트 설정(특히 한글 부분)
from matplotlib import font_manager, rc, pyplot as plt
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~')
    
plt.figure(figsize=(8, 8))
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis('off')
plt.show()

# Build the word cloud inside the mask shape.
# `stopwords1` is only defined further down the script, so referencing it here
# raised NameError; the `stopwords` set built above is the one available.
wc = WordCloud(background_color='white', max_words=2000, mask=alice_mask, stopwords=stopwords)
wc = wc.generate(text)

#wc.words_
plt.figure(figsize=(12, 12))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

#starwars
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

stopwords1 = set(STOPWORDS)
stopwords1.add("EXT")
stopwords1.add("INT")
stopwords1.add("Luke")
# =============================================================================
# 
# =============================================================================

# a_new_hope.txt
# Reload the script and normalise two character-name spellings before counting.
# Fixes: the paths were missing the "/" after "." and had "a_new>hope" instead
# of "a_new_hope", `Image.oepn` was a typo for `Image.open`, and the text file
# was never closed.
with open('./data/09. a_new_hope.txt') as script_file:
    text = script_file.read()
text = text.replace('HAN', 'Han')
text = text.replace("LUKE'S", "Luke")

mask = np.array(Image.open('./data/09. stormtrooper_mask.png'))

# Stopwords for this run: the defaults plus the lowercase "int" and "ext".
stopwords2 = set(STOPWORDS)
stopwords2.add("int")
stopwords2.add("ext")

# Use the mask loaded just above (same image file as `alice_mask`).
wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords2, margin=10, random_state=1).generate(text)
# =============================================================================
# 
# =============================================================================
import random
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random light-grey HSL colour string such as ``'hsl(0, 0%, 73%)'``.

    Passed as ``color_func`` to ``wc.recolor`` below.  None of the arguments
    are used: the lightness is drawn from the module-level ``random``
    generator, uniformly from the integers 60..100 inclusive.
    """
    lightness = random.randint(60, 100)
    return f'hsl(0, 0%, {lightness}%)'

plt.figure(figsize=(12,12))
plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3),interpolation='bilinear')

plt.axis('off')
plt.show()

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 24 11:03:15 2020

@author: USER
"""


# 꼬꼬마에 대하여
from konlpy.corpus import kolaw
from konlpy.tag import Kkma
from konlpy.utils import concordance

#'''
#헌법 관련된 텍스트 불러오기
constitution = kolaw.open('constitution.txt').read()

print(constitution)

#몇번째 줄에 '민주'라는 단어가 있는지 찾아줌
r=concordance(u'민주',constitution, show=False)
print("show=False =>", r)

#고려사항 : 정확성,속도
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

text = u'네, 안녕하세요. 반갑습니다.'

# Split the text into sentences
text_s = kkma.sentences(text)
print("text_s => ",  text_s)

print("type(text_s) =>",type(text_s))
print("text_s[0]=>",text_s[0])
# This prints the last sentence, so the label is [-1] (it was mislabelled [0]).
print("text_s[-1]=>",text_s[-1])

#tagset : 형식들에 대한 정보 파악
kkma = Kkma()
print(kkma.tagset)

# Sample text for the analyses below.  The original literal contained a stray
# backslash before the space: an invalid escape sequence that also left a
# literal backslash in the analysed text.
text = "자연어처리는 재미있습니다. 그러나 한국어 분석은 쉽지 않습니다."

# Noun extractor
text_nouns = kkma.nouns(text)
print(text_nouns)

#형태소 해석, Parse phrase to morphemes
#나중에 조사들을 추출해서 버리고
#의미있는 것들만 분석에 활용한다
text_morphs = kkma.morphs(text)
print(text_morphs)

# POS태그
pos_tagger = kkma.pos(text)
print(pos_tagger)

print(len(pos_tagger))
print(type(pos_tagger))
print(type(pos_tagger[0]))

#flatten=False : 문장단위에 따라서 묶음이 달라짐
# True일 때는 하나하나 다 풀어서 저장
pos_tagger_f = kkma.pos(text, flatten=False)

print(pos_tagger_f)
print(len(pos_tagger_f))
print(type(pos_tagger_f))
print(type(pos_tagger_f[0]))

# =============================================================================
# 
# =============================================================================
constitution = kolaw.open('constitution.txt').read()
pos_const = kkma.pos(constitution)

print(len(pos_const))
# 보통명사만 추출->가나다 순으로
pos_const_NNG = [ word[0] for word in pos_const
                                     if word[1]=='NNG']

print(len(pos_const_NNG))

pos_const_NNG.sort()
print(pos_const_NNG[:10])
print(pos_const_NNG[-10:])


# 모든 명사 추출
NN_list = [ 'NN', 'NNB', 'NNG', 'NNM', 'NNP', 'NP']

# 모든명사의 개수 파악하기
pos_const_NN = [ word[0] for word in pos_const
                                if word[1] in NN_list]
print(len(pos_const_NN))

pos_const_NN.sort()
print(pos_const_NN[:10])
print(pos_const_NN[-10:])

#set로 묶어서 유니크한 값 찾기
s=set(pos_const_NN)
print(len(s))

#헌법 관련된 텍스트 불러오기
constitution = kolaw.open('constitution.txt').read()

print(constitution)


# Look up where the phrase occurs in the text; this call passes show=True.
# NOTE(review): the original comment talked about searching for '민주', but the
# code searches for 'All' - confirm which phrase is intended.
r=concordance(u'All',constitution, show=True)
# The label now matches the show=True argument used above.
print("show=True =>", r)
# =============================================================================
# 
# =============================================================================
def getNounCnt(pos_list):
    """Count how many times each item occurs in ``pos_list``.

    Returns a plain ``dict`` mapping each distinct item to its number of
    occurrences; keys appear in the order the items are first seen.
    """
    tally = {}
    for token in pos_list:
        tally[token] = tally.get(token, 0) + 1
    return tally

noun_dict = getNounCnt(pos_const_NN)
print(len(noun_dict))
print(noun_dict)
# =============================================================================
# pos list=['가격','가부','가입','가족','가족','가치','각급','각급','각급']
# noun_cnt={'가격':1, '가부':1,'가입':2}
# noun='가입'
# =============================================================================
# =============================================================================
# 
# =============================================================================
# =============================================================================
# 
# =============================================================================

constitution = kolaw.open('constitution.txt').read()
pos_const = kkma.pos(constitution)

from collections import Counter
# most_common : 가장 많이 나온 것들만 뽑아냄
counter = Counter(pos_const)
print(counter.most_common(10))
# =============================================================================
# 
# =============================================================================
'''
noun_dict = getNounCnt(pos_const_NN)
'''
# 의미 있는 것을 찾기 위해서 명사들 중에서
# 가장 많이 나온 것들을 뽑아냄
noun_dict = getNounCnt(pos_const_NN)
counter = Counter(noun_dict)
print(counter.most_common(10))
# =============================================================================
# 
# =============================================================================
# 육아휴직관련 법안 대한민국 국회 제 1809890호 의안
import nltk
from konlpy.corpus import kobill

files_ko = kobill.fileids()
doc_ko = kobill.open('1809890.txt').read()


doc_ko
# =============================================================================
# 
# =============================================================================
# Twitter tagger: KoNLPy renamed it to Okt, and the old `Twitter` class is
# deprecated.  The rest of this file already uses Okt, so use it here as well.
from konlpy.tag import Okt

t = Okt()
# Extract the nouns from the bill text loaded above.
tokens_ko = t.nouns(doc_ko)
tokens_ko

ko = nltk.Text(tokens_ko, name='대한민국 국회 의안 제 1809890호')

print(len(ko.tokens))
print(len(set(ko.tokens)))
ko.vocab()

# chart 1
plt.figure(figsize=(12,6))
ko.plot(50)
plt.show()

stop_words =['.','(',')',',',"'",'%','-','X',').','의','자','에','안','번','호','을','이','다','만','로','가','를']
ko = [each_word for each_word in ko if each_word not in stop_words]
ko

# chart 2
ko = nltk.Text(ko, name='대한민국 국회 의안 제 1809890호')

plt.figure(figsize=(12,6))
ko.plot(50)
plt.show()

# =============================================================================
# 
# =============================================================================

# chart 3
ko.count('초등학교')
plt.figure(figsize=(12,6))
ko.dispersion_plot(['육아휴직','초등학교','공무원'])
ko.concordance('초등학교')

data = ko.vocab().most_common(150)

# wordcloud
# for mac : font_path='/Library/Fonts/AppleGothic.ttf'
wordcloud = WordCloud(font_path='c:/Windows/Fonts/malgun.ttf', relative_scaling = 0.2, background_color='white',).generate_from_frequencies(dict(data))
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 24 09:15:29 2020

@author: USER
"""


#### 한글 자연어 처리 기초
# Kkma
from lib2to3.btm_utils import tokens
from konlpy.tag import Kkma

kkma = Kkma()

print(kkma.sentences('한국어 분석을 시작합니다. 재미있어요~~'))
print(kkma.nouns('한국어 분석을 시작합니다. 재미있어요~~'))
print(kkma.pos('한국어 분석을 시작합니다. 재미있어요~~'))

# =============================================================================
# 
# =============================================================================
# Hannanum
from konlpy.tag import Hannanum

# The tagger has to be instantiated before use; without this line the calls
# below raised NameError because `hannanum` was never defined.
hannanum = Hannanum()

print(hannanum.nouns('한국어 분석을 시작합니다 재미있어요~~'))
print(hannanum.morphs('한국어 분석을 시작합니다 재미있어요~~'))
print(hannanum.pos('한국어 분석을 시작합니다 재미있어요~~'))
# =============================================================================
# 
# =============================================================================
# Twitter
from konlpy.tag import Okt

t = Okt()

print(t.nouns('한국어 분석을 시작합니다 재미있어요~~'))
print(t.morphs('한국어 분석을 시작합니다 재미있어요~~'))
print(t.pos('한국어 분석을 시작합니다 재미있어요~~'))
# =============================================================================
# 
# =============================================================================
# Komoran()
# Komoran
from konlpy.tag import Komoran

k = Komoran()
print(k.nouns('한국어 분석을 시작합니다 재미있어요~~'))
print(k.morphs('한국어 분석을 시작합니다 재미있어요~~'))
print(k.pos('한국어 분석을 시작합니다 재미있어요~~'))
# =============================================================================
# 
# =============================================================================
# Performance comparison: time how long each tagger takes to run morphs()
# over the same sentence 100 times.
import time
sentence = u"감정노동자 보호법은 사업주로 하여금 감정노동으로부터 근로자를 보호"
sentences = [sentence] * 100

from konlpy.tag import Hannanum, Kkma, Komoran, Okt

# Every entry must be a (name, tagger) pair.  The Komoran entry was missing its
# parentheses, which put a bare string and a bare tagger into the list and
# broke the tuple unpacking in the loop.
morphs_processors = [('Hannanum', Hannanum()), ('Kkma', Kkma()), ('Komoran', Komoran()), ('Okt', Okt())]

# The loop variable is singular so it no longer shadows the list, and it now
# matches the name used inside the comprehension (previously undefined).
for name, morphs_processor in morphs_processors:
    start_time = time.time()
    morphs = [morphs_processor.morphs(sentence) for sentence in sentences]
    elapsed_time = time.time() - start_time
    print('morphs_processor name = %20s, % 5f secs' % (name,elapsed_time))

'''
빠른 속도와 보통의 정확도를 원한다면 "Komoran" 또는 "Hannanum"
속도는 느리더라도 정확하고 상세한 품사 정보를 원한다면 "Kkma"

어느 정도의 띄어쓰기 되어 있는 "인터넷" 영화평/상품명을 처리할 땐 "Okt"
(만약 띄어쓰기가 없다면 느린 처리속도는 감수해야함)
'''

'''
언어를 분석할때, stopwords 라는 용어가 나온다.
stopwords 또는 불용어 란, 우리가 언어를 분석할 때,
의미가 있는 단어와, 의미가 없는 단어나 조사 등이 있다.

이렇게 의미가 없는 것들을 stopwords 라고 한다.

예를 들어서, 다음 문장이 있으면,
"Family is not an important thing. It's everything."

Family, important, thing, everything은 의미가 있다고 보고,
나머지 아래 같은 것들은 의미가 없다고 판단하여 stopwords로 정의한다.
'''

# Naive Bayes Classifier의 이해 - 한글
# 문장의 유사도 측정
'''
'메리가 좋아'
'고양이도 좋아'
'난 수업이 지루해'
'메리는 이쁜 고양이야'
'난 마치고 메리랑 놀거야'
'''

from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
import nltk

pos_tagger = Okt()

train = [('메리가 좋아', 'pos'),
         ('고양이도 좋아', 'pos'),
         ('난 수업이 지루해', 'neg'),
         ('메리는 이쁜 고양이야', 'pos'),
         ('난 마치고 메리랑 놀거야', 'pos')]

all_words = set(word.lower()
                for sentence in train
                for word in word_tokenize(sentence[0]))

all_words

t = [({word: (word in word_tokenize(x[0]))
        for word in all_words}, x[1]) for x in train]

t

classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

'''
Numpy란 Numerical Python 의 약자로
대규모 다차원 배열과 행렬 연산에 필요한 다양한 함수를 제공한다.
데이터 분석할 때 사용되는 다른 라이브러리 pandas와 matplotlib의 기반이 된다.
기본적으로 array라는 단위로 데이터를 관리하는데, 행렬 개념으로 생각하면 된다.

- numpy 특징 : 일반 list에 비해 빠르고 메모리에 효율적이다.
선형대수와 관련된 다양한 기능을 제공하고,
for문, while문 같은 반복문 없이 데이터 배열에 대한 처리를 지원한다.

 - numpy가 빠른 이유 : numpy는 메모리에 차례대로 생성/할당을 해준다.
 반면 기존의 List는 이 값(value)가 어디에 있는지 주소만 저장을 해놓고 그 주소를 알려준다.
 그래서 list를 for문을 돌리면 그 주소마다 하나하나씩 다 찾아가면서 연산을 해줘야 하는데,
 (파이썬은 변수형태로 들어가고 numpy는 메모리를 직접관리)
 numpy는 같은 곳에 몰려 있기 때문에 연산이 더 빠르게 이뤄진다.
 
  --> 넘파이는 데이터들을 한군데 차곡차곡 모아서 관리하는게 특징이다. 그러나 파이썬의 리스트는
  데이터가 있는 번지수가 있어서 그 변수. 즉, 주소들만 기억하고 실제로 찾아내려면 그 번지를 이용해서 찾아다녀야 한다.
  
 - numpy 호출 : "import numpy as np"로 numpy를 호출하는데 모두 np라는 별칭(alias)로 호출하지만 특별한 이유는 없다.
 - numpy로 array 생성하는 방법 : ex)test_array = np.array([1,3,5,7], float)
  type(test_array[3])을 하면 numpy.float64 가 반환되며, 요소 하나는 8바이트(64bit)를 차지한다.
  float32 같은 데이터 타입은 하나씩 모여서 메모리 블럭을 구성한다.
  32bit(비트) = 4byte(바이트)이다. (8bit 가 1byte)
'''
import numpy as np

test_array = np.array([1,3,5,7], float)
print(test_array) # [1. 3. 5. 7.]
print(type(test_array)) # <class 'numpy.ndarray'>

'''
ndarray의 구성 -> (4,)
ndarray의 shape : (type : tuple)
이건 1차원 벡터형식이라고 부른다.

vector는 1차원 행렬을 말하고 하나의 행에 열만 있는 것이다.
(위의 그림 예시에서는 1차원에 4개의 element만 있음)
각 숫자는 value(요소)라고도 부른다.
shape를 보는 코드 예시는 그림 상에 없지만 결과적으로 (4,)의 결과를 보여줄 것이다.
'''

matrix = [[1,2,3,4], [5,6,7,8], [9,10,11,12]]
matrix2 = np.array(matrix, int).shape # (3, 4)
type(matrix2) # tuple
type(matrix) # list

tensor = [[[1,2,3,4],[5,6,7,8],[9,10,11,12]],
          [[1,2,3,4],[5,6,7,8],[9,10,11,12]],
          [[1,2,3,4],[5,6,7,8],[9,10,11,12]]]

type(tensor) # list
np.array(tensor, int).shape # (3, 3, 4)
# 3차원 matrix, nparray의 shape(type : tuple)

np.array(tensor, int).ndim # number of dimension
# 3
np.array(tensor, int).size # data의 개수
# 36

'''
- Ndarray의 single element가 가지는 data type
- 각 element가 차지하는 memory의 크기가 결정됨
'''
np.array( [[1,2.6, 3.2], [4, 5.1, 6]], dtype=int)
#array([[1, 2, 3],
#       [4, 5, 6]])

np.array( [[1,2.6, 3.2], [4, "5", 6]], dtype=np.float32)
#array([[1. , 2.6, 3.2],
#       [4. , 5. , 6. ]], dtype=float32)

'''
각 요소마다 데이터 타입을 지정해주면 그 데이터 타입으로 변환이 되는 걸 볼 수 있다.

아래의 예시를 보면 여기는 실수형이고 여기는 string 타입인데 소수점 타입으로 바꾸면
???????????????????????????????????????????????????????????????????????????????
'''

# - nbyte : ndarray object의 메모리 크기 리턴

np.array([[1, 2.6, 3.2], [4, "5", 6]], dtype=np.float32).nbytes
# 24 -> 32bits = 4bytes -> 6 * 4 bytes
np.array([[1, 2.6, 3.2], [4, "5", 6]], dtype=np.float64).nbytes
# 48 -> 64bits = 8bytes -> 6 * 8 bytes
np.array([[1, 2.6, 3.2], [4, "5", 6]], dtype=np.int8).nbytes
# 6 -> 8bits = 1 bytes -> 6 * 1bytes


'''
하나의 value가 4바이트를 가지는데
요소가 6개 있으니까, 이게 메모리에서 차지하는 건 총 24바이트가 된다.
그 다음 타입은 하나가 8바이트이니까 48바이트를 차지한다.
'''

# - Array의 크기를 변경함(element의 개수는 동일)
t_matrix = [[1,2,3,4], [5,6,7,8]]
np.array(t_matrix).shape
#(2, 4)
np.array(t_matrix).reshape(8,)
#array([1, 2, 3, 4, 5, 6, 7, 8]) --> 2행으로 되어있는 것을 1행으로 변경하였다.
np.array(t_matrix).reshape(8,).shape
#(8,)


# - Array의 size만 같다면 다차원으로 자유로이 변형 가능
np.array(t_matrix).reshape(2, 4).shape
#(2, 4)
np.array(t_matrix).reshape(-1, 2).shape # -1 을 입력하면 알아서 행 갯수를 맞춰준다.
#(4, 2)
np.array(t_matrix).reshape(2,2,2)
#array([[[1, 2],
#        [3, 4]],
#       [[5, 6],
#        [7, 8]]])
np.array(t_matrix).reshape(2,2,2).shape
#(2, 2, 2)

# flatten : 다차원 array를 1차원 array로 변환
t_matrix = [ [[1,2], [3,4]], [[1,2], [3,4]], [[1,2],[3,4]]]
np.array(t_matrix).flatten()
# array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4])

# flatten : 다차원을 1차원으로만 만들어 주는 함수. 즉, reshape은 최소화할 열이나 행 둘중에 하나를 뿌려줘야 하는데 flatten은 1차원으로만됨

# indexing
a = np.array([[1,2.2,3], [4,5,6.3]],int)
print(a)

#[[1 2 3]
# [4 5 6]]

print(a[0,0]) # 1
print(a[0][0]) # 1
a[0,0]=7
print(a)
#[[7 2 3]
# [4 5 6]]

# slicing
# - list와 달리 행과 열 부분을 나눠서 slicing 이 가능함
# - matrix 부분 집합 추출할 때 유용
a = np.array([[1,2,3,4,5],[6,7,8,9,10]],int)
a[:,1:] # 전체 row의 1열 이상
#array([[ 2,  3,  4,  5],
#       [ 7,  8,  9, 10]])

a[1, 2:4] # 1row의 2~3열
#array([8, 9])

a[1:3] #1row ~ row 전체
#array([[ 6,  7,  8,  9, 10]])

a = np.array([[0,1,2,3,4], [5,6,7,8,9], [10,11,12,13,14]], int)
print(a)
#[[ 0  1  2  3  4]
# [ 5  6  7  8  9]
# [10 11 12 13 14]]

a[:,::2] # 행과 열은 내부 인덱스를 뽑는거고, 두 번째는 위치를 뽑는것
#array([[ 0,  2,  4],
#       [ 5,  7,  9],
#       [10, 12, 14]])

a[::2,::2] # ::은 스텝(간격)을 의미한다.
#array([[ 0,  2,  4],
#       [10, 12, 14]])

#배열을 만드는데 범위를 지정해서 만드는 방법
# - array의 범위를 지정하여, 값의 list를 생성하는 명령어

np.arange(20)#list의 range와 같은 역할, intege로 0부터 19까지 배열 추출
# array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

np.arange(0, 1, 0.2) #float 가능
#array([0. , 0.2, 0.4, 0.6, 0.8])

np.arange(20).reshape(4,5) # 만든 값을 reshape한다.
#array([[ 0,  1,  2,  3,  4],
#       [ 5,  6,  7,  8,  9],
#       [10, 11, 12, 13, 14],
#       [15, 16, 17, 18, 19]])

np.zeros(shape = (5,2), dtype = np.int8) # 5 by 2 zero matrix 생성, int8
#array([[0, 0],
#       [0, 0],
#       [0, 0],
#       [0, 0],
#       [0, 0]], dtype=int8)

np.ones(shape = (5,2), dtype = np.int8) # 5 by 2 one matrix 생성, int8
#array([[1, 1],
#       [1, 1],
#       [1, 1],
#       [1, 1],
#       [1, 1]], dtype=int8)

np.empty(shape=(3,2), dtype=np.int8)
#array([[1, 2],
#       [3, 4],
#       [5, 6]], dtype=int8)
# 실행할 때 마다 다른 값이 출력된다.

'''
empty는 주어진 shape대로 비어있는 것을 생성한다.
이런 식으로 array를 만드는데 메모리를 어느 정도 할당 시켜준다.
그런데 메모리에 기존에 있었던 값을 보여준다.

zeros나 ones는 0과 1로 메모리 할당 값을 초기화 시켜주는데
empty는 초기화시키지 않고 기존에 메모리에 있는 찌꺼기 그대로 보여준다.
'''
'''
empty는 주어진 shape대로 비어있는 것을 생성한다.
이런 식으로 array를 만드는데 메모리를 어느 정도 할당 시켜준다.
그런데 메모리에 기존에 있었던 값을 보여준다.

zeros나 ones는 0과 1로 메모리 할당 값을 초기화 시켜주는데
empty는 초기화시키지 않고 기존에 메모리에 있는 찌꺼기 그대로 보여준다.
'''

#기존 ndarray의 shape 크기만큼 1 or 0 or empty array 반환
t_matrix = np.arange(15).reshape(3,5)
np.ones_like(t_matrix) # 채워져있는 구조를 이용해서 1로 바꾼 것, 원본 반영되는게 아님
#array([[1, 1, 1, 1, 1],
#       [1, 1, 1, 1, 1],
#       [1, 1, 1, 1, 1]])

t_matrix1 = np.arange(15).reshape(3,5)
np.zeros_like(t_matrix1) # 채워져있는 구조를 이용해서 0으로 바꾼 것, 원본 반영되는게 아님
#array([[0, 0, 0, 0, 0],
#       [0, 0, 0, 0, 0],
#       [0, 0, 0, 0, 0]])

t_matrix2 = np.arange(15).reshape(3,5)
np.empty_like(t_matrix2)
#array([[ 0,  1,  2,  3,  4],
#       [ 5,  6,  7,  8,  9],
#       [10, 11, 12, 13, 14]])
#기존 ndarray의 shape 크기만큼 1 or 0 or empty array 반환
t_matrix = np.arange(15).reshape(3,5)
np.ones_like(t_matrix) # 채워져있는 구조를 이용해서 1로 바꾼 것, 원본 반영되는게 아님
#array([[1, 1, 1, 1, 1],
#       [1, 1, 1, 1, 1],
#       [1, 1, 1, 1, 1]])

t_matrix1 = np.arange(15).reshape(3,5)
np.zeros_like(t_matrix1) # 채워져있는 구조를 이용해서 0으로 바꾼 것, 원본 반영되는게 아님
#array([[0, 0, 0, 0, 0],
#       [0, 0, 0, 0, 0],
#       [0, 0, 0, 0, 0]])

t_matrix2 = np.arange(15).reshape(3,5)
np.empty_like(t_matrix2)
#array([[ 0,  1,  2,  3,  4],
#       [ 5,  6,  7,  8,  9],
#       [10, 11, 12, 13, 14]])


# 단위행렬 (i행렬)을 생성 n -> number of rows
np.identity(n=3, dtype=np.int8)
#array([[1, 0, 0],
#       [0, 1, 0],
#       [0, 0, 1]], dtype=int8)

np.identity(n=5) #정사각형 행렬
#array([[1., 0., 0., 0., 0.],
#       [0., 1., 0., 0., 0.],
#       [0., 0., 1., 0., 0.],
#       [0., 0., 0., 1., 0.],
#       [0., 0., 0., 0., 1.]])

# N and M can differ, which gives a rectangular matrix.
# np.int was removed in NumPy 1.24; the builtin int is the supported spelling.
np.eye(N=3, M=4, dtype=int)
#array([[1, 0, 0, 0],
#       [0, 1, 0, 0],
#       [0, 0, 1, 0]])

np.eye(4) # identity행렬과 같게 출력
#array([[1., 0., 0., 0.],
#       [0., 1., 0., 0.],
#       [0., 0., 1., 0.],
#       [0., 0., 0., 1.]])

np.eye(4) # identity행렬과 같게 출력
#array([[1., 0., 0., 0.],
#       [0., 1., 0., 0.],
#       [0., 0., 1., 0.],
#       [0., 0., 0., 1.]])

np.eye(3, 6, k=3) # k --> start index
# 기준 열에서 1을 시작점으로 찍는 옵션(3칸 건너 뛰고 시작한다.)
#array([[0., 0., 0., 1., 0., 0.],
#       [0., 0., 0., 0., 1., 0.],
#       [0., 0., 0., 0., 0., 1.]])


'''행렬 중 대각선 값만 뽑아내는 함수'''
t_matrix = np.arange(16).reshape(4,4)
np.diag(t_matrix)
# array([ 0,  5, 10, 15])

np.diag(t_matrix, k=1) # k옵션은 출력하는 열의 시작 위치를 나타낸다.
# array([ 1,  6, 11])

'''Random Sampling'''
# 균등분포/정규분포 난수를 만들어주는 함수
np.random.uniform(0,1,12).reshape(4,3) # 균등분포
# np.random.uniform(최소값, 최대값, 데이터 개수)
#array([[0.39422635, 0.83574142, 0.10830835],
#       [0.33941921, 0.97021726, 0.91356626],
#       [0.9100399 , 0.8243246 , 0.85169551],
#       [0.48654262, 0.15473669, 0.8773488 ]])


np.random.normal(0,1,12).reshape(4,3) # 정규분포
#array([[ 0.29332096, -0.05565469, -0.04275069],
#       [ 0.15716903, -0.50732937, -0.12142198],
#       [ 1.42172159,  0.29410614, -2.32252451],
#       [ 1.44995498, -0.75131102,  0.35022273]])

'''axis'''
# - 모든 operation function 을 실행할 때, 기준이 되는 dimension 축
# 집계연산을 할 때 어떤 축을 기준으로 집계연산을 해달라는 의미
t_array = np.arange(1,13).reshape(3,4)
t_array
#array([[ 1,  2,  3,  4],
#       [ 5,  6,  7,  8],
#       [ 9, 10, 11, 12]])

t_array.sum(axis = 0), t_array.sum(axis=1)
#(array([15, 18, 21, 24]), array([10, 26, 42]))

t_array = np.arange(1, 13).reshape(3,4)
t_array
#array([[ 1,  2,  3,  4],
#       [ 5,  6,  7,  8],
#       [ 9, 10, 11, 12]])

t_array.mean(), t_array.mean(axis=0)
#(6.5, array([5., 6., 7., 8.]))

t_array.std(), t_array.std(axis=0)
#(3.452052529534663, array([3.26598632, 3.26598632, 3.26598632, 3.26598632]))

a = np.array([1,2,3])
b = np.array([4,5,6])
np.vstack((a,b))
# array([[1, 2, 3],
#        [4, 5, 6]])

a = np.array([ [1], [2], [3]])
b = np.array([ [4], [5], [6]])
np.hstack((a,b))
# array([[1, 4],
#        [2, 5],
#        [3, 6]])

# All, Any
# - All : array의 데이터가 전부 조건에 만족하면 True
# - Any : array의 데이터 중 하나라도 조건에 만족하면 True
a = np.arange(5)
a
#array([0, 1, 2, 3, 4])

np.all(a>3)
# False
np.all(a<5)
# True
np.any(a>3)
# True
np.any(a>5)
# False

'''all은 말 그대로 모든 조건 만족하면 true가 나오고, any는 하나라도 만족하면 true를 추출해내는 함수이다.'''

'''all은 말 그대로 모든 조건 만족하면 true가 나오고, any는 하나라도 만족하면 true를 추출해내는 함수이다.'''

a = np.array( [1,5,3], float)
b = np.array( [4,7,2], float)
a>b
#array([False, False,  True])
a==b
#array([False, False, False])
(a>b).any()
#True
(a>b).all()
#False

'''np.where'''
# - where(조건, True, False)

a = np.array( [2,3,1], float)
np.where(a > 1, 0, 3)
#array([0, 0, 3])

a = np.arange(3,10)
np.where(a>6) # True값의 index 반환
#(array([4, 5, 6], dtype=int64),)

# np.NaN / np.Inf were removed in NumPy 2.0; np.nan / np.inf are the canonical names.
a = np.array( [2, np.nan, np.inf], float)
np.isnan(a)
#array([False,  True, False])

np.isfinite(a) # True only for finite numbers
#array([ True, False, False])

# np.where은 우리가 생각하는 if문의 역할을 한다.
'''
isnan은 null값인 경우에만 True가 나온다
np.nan은 numpy에서 결측값(NaN)을 나타내는 상수이고,
null값이니까 True

np.Inf 는 무한대

np.isfinite()는 한정된(유한한) 수의 경우 True가 나오고
한정되지 않은 NaN이나 Inf의 경우에는 False가 나온다

'''
'''argmax, argmin'''
# - array 내 최대값 또는 최소값의 index 반환
a = np.array( [2,3,1,5,6,22,11])
np.argmax(a), np.argmin(a)
#(5, 2)

# -axis 기반의 반환
a = np.array( [[1,4,2,22], [45,32,4,7], [34,54,9,8]])
np.argmax(a,axis=0), np.argmin(a, axis=1)
#(array([1, 2, 2, 0], dtype=int64), array([0, 2, 3], dtype=int64))

'''argmax, argmin'''
# - array 내 최대값 또는 최소값의 index 반환
a = np.array( [2,3,1,5,6,22,11])
np.argmax(a), np.argmin(a)
#(5, 2)

# -axis 기반의 반환
a = np.array( [[1,4,2,22], [45,32,4,7], [34,54,9,8]])
np.argmax(a,axis=0), np.argmin(a, axis=1) # 0번(열 기준), 1번(행 기준)
#(array([1, 2, 2, 0], dtype=int64), array([0, 2, 3], dtype=int64))
# 첫 번째 열에서 가장 큰 값 45(1번 인덱스)-컬럼
# 첫 번째 행에서 가장 작은 값 1(0번 인덱스)-로우, 두번째 행에서 가장 작은값 4(2번 인덱스) - 로우

'''boolean index '''
# -numpy의 배열은 특정 조건에 따른 값을 배열 형태로 추출 가능
# -comparison operation 함수들도 모두 사용 가능
t_a = np.array( [3,5,8,0,7,4], float)
t_a > 4
#array([False,  True,  True, False,  True, False])

t_a[t_a>4] # 조건이 True인 index 의 요소값만 추출
#array([5., 8., 7.])

t_c = t_a <4
t_c
# array([ True, False, False,  True, False, False])

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 24 17:19:42 2020

@author: admin
"""


import numpy as np

# concatenate
# - numpy array를 합치는 함수
a = np.array([[1,2,3]])
b = np.array([[4,5,6]])
np.concatenate((a,b), axis = 0)
#array([[1, 2, 3],
#       [4, 5, 6]])

a = np.array([[1,2], [3,4]])
b = np.array([[5,6]])
np.concatenate((a,b.T), axis=1 ) # b.T is the transpose of b (not an inverse matrix): shape (1, 2) becomes (2, 1)
#array([[1, 2, 5],
#       [3, 4, 6]])

# vstack과 hstack은 서로 다른 함수이며, 2차원 배열에서는 각각 concatenate의 axis=0, axis=1과 같은 결과를 낸다.

'''Operations between arrays'''
a = np.array( [[1,2,3], [4,5,6]], float)
a+a # matrix + matrix 연산
#array([[ 2.,  4.,  6.],
#       [ 8., 10., 12.]])

a-a # - 연산
#array([[0., 0., 0.],
#       [0., 0., 0.]])

a*a # matrix 내 요소들간 같은 위치에 있는 값들끼리 연산
#array([[ 1.,  4.,  9.],
#       [16., 25., 36.]])

'''이렇게 같은 index에 있는 것 끼리 더하고 빼고 곱해줘서 그 자리에 결과값을 넣어주는 연산 
= Element-wise Operation 이라고 한다.'''


# Dot product
# - matrix 의 기본 연산
# - dot 함수 사용

dot_a = np.arange(1, 7).reshape(2,3)
#array([[1, 2, 3],
#       [4, 5, 6]])

dot_b = np.arange(1, 7).reshape(3,2)
#array([[1, 2],
#       [3, 4],
#       [5, 6]])

dot_a.dot(dot_b)
# 행렬 연산식 적용 ( 2x3 행렬과 3x2 행렬의 곱)
#array([[22, 28],
#       [49, 64]])

t_matrix = np.array( [[1,2], [3,4]], float)
#array([[1., 2.],
#       [3., 4.]])

scalar = 2

t_matrix + scalar
#array([[3., 4.],
#       [5., 6.]])

저작자표시 비영리 변경금지 (새창열림)

':: IT > python' 카테고리의 다른 글

[flask]쿠키와 세션 다루기,리다이렉션과 에러처리, 파일 업로드 하기 (0)	2020.05.07
Flask 입문, 파이썬과 HTML, 폼으로 데이터 전송받기 (0)	2020.05.06
[파이썬] 전처리 20200423 (0)	2020.04.27
[파이썬]수집된 데이터 형식 확인 및 로컬 전처리 (0)	2020.04.23
[파이썬] 분석데이터 수집 및 분석(전처리) (0)	2020.04.23

:: GO치의 에브리데이 일기장::

[파이썬] 데이터 전처리 20200424

':: IT > python' 카테고리의 다른 글

+ Recent posts

티스토리툴바