데이터 분석/04. Data Analysis
[통계적 모델링] 선형회귀, 로지스틱회귀
xod22
2022. 3. 18. 00:20
728x90
선형회귀 개요
1. 패키지 임포트
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
2. 데이터 생성
# Toy data set: three consecutive integer ranges of length five.
# Note that x1 == y + 5 and x2 == y + 10, so the columns are collinear.
y = np.arange(1, 6)
x1 = np.arange(6, 11)
x2 = np.arange(11, 16)
# Name -> array mapping; this is what the Patsy formulas look variables up in.
data = dict(y=y, x1=x1, x2=x2)
3. 모델 생성
# Build the response (y) and design (X) matrices from a Patsy formula with an
# explicit intercept, both main effects and the x1:x2 interaction term.
# NOTE: this rebinds `y` from the raw array to a DataFrame; `data` still holds
# the original array.
y, X = patsy.dmatrices(
    "y~1+x1+x2+x1:x2",
    data,
    return_type="dataframe",
)

# Ordinary least squares on the matrices built above.
model = sm.OLS(endog=y, exog=X)
result = model.fit()

# Bare expression: the fitted coefficients are only displayed when this runs
# as the last statement of an interactive/notebook cell.
result.params
~":" 대신에 "*"을 써줌~
# Same model written with the "*" operator instead of ":" -- in Patsy's
# formula language `x1*x2` stands for x1 + x2 + x1:x2 (intercept implied).
y, X = patsy.dmatrices(
    "y~x1*x2",
    data,
    return_type="dataframe",
)

# Fit by ordinary least squares.
model2 = sm.OLS(endog=y, exog=X)
result2 = model2.fit()

# Bare expression: shown only in an interactive/notebook session.
result2.params
위의 수식과 같은 결과를 나타냄..!
# Main-effects-only model: y regressed on x1 and x2, no interaction term.
y, X = patsy.dmatrices(
    "y~x1+x2",
    data,
    return_type="dataframe",
)

# Fit by ordinary least squares.
model3 = sm.OLS(endog=y, exog=X)
result3 = model3.fit()

# Bare expression: shown only in an interactive/notebook session.
result3.params
~로그/삼각함수도 표현 가능~
# Formula terms may be arbitrary expressions: here the regressors are
# log(x1), cos(x2) and sin(x1 + x2), evaluated with the `np` that is in scope.
y, X = patsy.dmatrices(
    "y~np.log(x1)+np.cos(x2)+np.sin(x1+x2)",
    data,
    return_type="dataframe",
)

# Fit by ordinary least squares.
model4 = sm.OLS(endog=y, exog=X)
result4 = model4.fit()

# Bare expression: shown only in an interactive/notebook session.
result4.params
범주 생성
: Patsy 패키지는 범주형 변수를 생성할 수 있다.
자동으로 숫자값을 C(x1)이렇게 지정하면 카테고리화됨..!
# Treat the numeric x1 as a categorical factor via C(...); "-1" drops the
# intercept from the design matrix.
y, X = patsy.dmatrices(
    "y~-1+C(x1)",
    data=data,
    return_type="dataframe",
)

# Show the resulting design matrix.
print(X)
1. 선형회귀 분석
# Icecream data set: ice cream consumption, customer income, ice cream price
# and daily temperature.
dataset = sm.datasets.get_rdataset("Icecream", package="Ecdat")

# Regress consumption on income, price and temperature (explicit intercept).
model = smf.ols(formula="cons~1+income+price+temp", data=dataset.data)
result = model.fit()

# Print the full regression report.
print(result.summary())
~ income을 제외하고 소비량~가격, 온도의 회귀~
# Regression of consumption on price and temperature only (income dropped).
model = smf.ols(formula="cons~1+price+temp", data=dataset.data)
result = model.fit()

# Print the full regression report.
print(result.summary())
2. 이산회귀분석 : 로지스틱회귀분석
~iris 데이터 불러오기~
# Load the iris data set as a DataFrame.
df = sm.datasets.get_rdataset("iris").data

# Keep only the two species used for the binary problem; copy so the
# assignments below do not touch `df`.
df_subset = df.loc[df["Species"].isin(["versicolor", "virginica"])].copy()

# Encode the response: versicolor -> 1, virginica -> 0.
df_subset["Species"] = df_subset["Species"].map({"versicolor": 1, "virginica": 0})

# Replace the dots in the column names with underscores so the columns can be
# referenced by name in a formula.
df_subset = df_subset.rename(
    columns={
        "Sepal.Length": "Sepal_Length",
        "Sepal.Width": "Sepal_Width",
        "Petal.Length": "Petal_Length",
        "Petal.Width": "Petal_Width",
    }
)
~로지스틱회귀 모델 생성~
# Logistic regression of the 0/1 species label on the two petal measurements.
model = smf.logit(
    formula="Species ~ Petal_Length + Petal_Width",
    data=df_subset,
)
result = model.fit()

# Print the full fit report.
print(result.summary())
~plot~
# Scatter plot of the two species with the fitted decision boundary.
params = result.params

# Boundary: Intercept + b_len * length + b_wid * width = 0, solved for width
# gives width = alpha0 + alpha1 * length.
alpha0 = -params["Intercept"] / params["Petal_Width"]
alpha1 = -params["Petal_Length"] / params["Petal_Width"]

# Petal-length endpoints between which the boundary line is drawn.
_x = np.array([3.0, 7.0])

fig, ax = plt.subplots(figsize=(8, 4))

# Class 0 (virginica) is drawn first, then class 1 (versicolor); both use
# square markers without connecting lines.
ax.plot(
    df_subset.loc[df_subset["Species"] == 0, "Petal_Length"].values,
    df_subset.loc[df_subset["Species"] == 0, "Petal_Width"].values,
    "s",
    label="virginica",
)
ax.plot(
    df_subset.loc[df_subset["Species"] == 1, "Petal_Length"].values,
    df_subset.loc[df_subset["Species"] == 1, "Petal_Width"].values,
    "s",
    label="versicolor",
)

# Decision boundary line.
ax.plot(_x, alpha0 + alpha1 * _x)

ax.set_xlabel("Petal length")
ax.set_ylabel("Petal width")
ax.legend()
=> 잘 분류하고 있음을 확인..!
728x90