# General-purpose file handling:
import numpy as np
# Arguments: the file name and the delimiter that separates the fields on each line of the file.
def loadDataSet(fileName, dotSplit):
    """Load a delimited numeric text file into feature and label matrices.

    Every non-blank line is split on ``dotSplit``. The number of columns is
    taken from the first non-blank line; the last of those columns is treated
    as the dependent variable and the preceding ones as features.

    Args:
        fileName: Path of the text file to read.
        dotSplit: Field delimiter passed to ``str.split``.

    Returns:
        A tuple ``(xMat, yMat)`` of ``np.matrix`` objects: ``xMat`` holds one
        row of features per data line, ``yMat`` is the column of labels.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a line has fewer fields than the first data line.
    """
    # Read once inside a context manager so the handle is always closed.
    # Blank lines (typically a trailing newline) carry no data and are dropped.
    with open(fileName) as fr:
        rows = [line for line in fr if line.strip()]

    numFeat = len(rows[0].split(dotSplit)) if rows else 1
    dataMat = []
    labelMat = []
    for line in rows:
        curline = line.split(dotSplit)
        # By default the last column of the data set is the dependent variable.
        label = float(curline[numFeat - 1])
        dataMat.append([float(field) for field in curline[:numFeat - 1]])
        labelMat.append(label)

    # np.asmatrix is the spelling available in both old and new NumPy
    # releases (the np.mat alias was dropped in NumPy 2.0).
    xMat = np.asmatrix(dataMat)
    yMat = np.asmatrix(labelMat).T
    return xMat, yMat
# Implementation of ridge regression:
import numpy as np
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve the ridge-regression normal equations for one penalty value.

    Computes ``(X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: Feature matrix with one sample per row. Anything accepted by
            ``np.asmatrix`` works (``np.matrix``, 2-D array, nested list).
        yMat: Column of target values with one row per sample.
        lam: Ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weight column vector as an ``np.matrix``, or ``None`` (after
        printing ``"wrong"``) when the penalised matrix has a determinant of
        exactly zero and therefore cannot be inverted.
    """
    # Coerce to np.matrix so that `*` below is always a matrix product,
    # even when the caller hands in plain arrays or lists.
    xMat = np.asmatrix(xMat)
    yMat = np.asmatrix(yMat)

    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    # Diagnostic output kept from the original: number of samples.
    print(np.shape(xMat)[0])
    if np.linalg.det(denom) == 0.0:
        print("wrong")
        return None
    ws = denom.I * (xMat.T * yMat)
    return ws
def normalizing(xMat, yMat):
    """Centre the targets and centre/scale the feature columns.

    Each feature column has its mean subtracted and is then divided by its
    variance (note: variance, not standard deviation). The targets only have
    their mean subtracted.

    Args:
        xMat: Feature matrix with one sample per row.
        yMat: Target values with one row per sample.

    Returns:
        A tuple ``(x, y)`` with the scaled features and the centred targets.
    """
    yMean = np.mean(yMat, 0)
    y = yMat - yMean

    xMeans = np.mean(xMat, 0)
    xVar = np.var(xMat, 0)
    # A constant column has zero variance; dividing would give 0/0 = NaN and
    # poison every later computation. Its centred values are already all
    # zero, so dividing by 1 instead leaves it as a harmless zero column.
    safeVar = np.where(xVar == 0, 1.0, xVar)
    x = (xMat - xMeans) / safeVar
    return x, y
def ridgeTest(xM, yM, numTestPts=30):
    """Compute ridge-regression weights over an exponential range of penalties.

    The data are first passed through ``normalizing``; then ``ridgeRegres`` is
    run once per penalty value ``exp(i - 10)`` for ``i = 0 .. numTestPts - 1``.

    Args:
        xM: Feature matrix with one sample per row.
        yM: Column of target values with one row per sample.
        numTestPts: Number of penalty values to try (defaults to 30).

    Returns:
        An ``ndarray`` of shape ``(numTestPts, n_features)``; row ``i`` holds
        the weights for penalty ``exp(i - 10)``. A row is filled with NaN when
        ``ridgeRegres`` reports that it could not solve for that penalty.
    """
    xMat, yMat = normalizing(xM, yM)
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    # Diagnostic output kept from the original.
    print(wMat)
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        if ws is None:
            # ridgeRegres returns None for a singular system; mark the row as
            # missing instead of failing on `None.T`.
            wMat[i, :] = np.nan
            continue
        wMat[i, :] = np.asarray(ws).ravel()
    return wMat
# Forward stagewise regression:
import numpy as np
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Args:
        yArr: Observed values (array, ``np.matrix`` or nested list).
        yHatArr: Predicted values, broadcast-compatible with ``yArr``.

    Returns:
        The sum of the squared element-wise differences.
    """
    # Convert to plain arrays first: on an np.matrix, `** 2` is a matrix
    # power (and fails for column vectors) rather than an element-wise square.
    residual = np.asarray(yArr) - np.asarray(yHatArr)
    return np.square(residual).sum()
def stageWise(xM, yM, eps=0.01, numIt=100):
    """Run forward stagewise regression and record the weights per iteration.

    Starting from all-zero weights, every iteration tries moving each single
    weight by ``+eps`` and ``-eps``. A candidate is adopted only if its
    residual sum of squares is lower than the best value seen so far, so the
    error never increases from one iteration to the next. The inputs are used
    as given; no centring or scaling is applied here.

    Args:
        xM: Feature matrix with one sample per row.
        yM: Target values, one per sample.
        eps: Step size applied to a single weight per trial.
        numIt: Number of iterations to run.

    Returns:
        An ``ndarray`` of shape ``(numIt, n_features)`` whose row ``i`` holds
        the weights at the end of iteration ``i``.
    """
    xMat = np.asmatrix(xM)
    # Force a column shape so the subtraction inside rssError is row-by-row
    # and can never silently broadcast into an (m, m) grid.
    yArr = np.asarray(yM).reshape(-1, 1)

    n = np.shape(xMat)[1]
    returnMat = np.zeros((numIt, n))
    ws = np.zeros((n, 1))
    wsMax = ws.copy()

    # Baseline is the error of the starting weights. The original seeded it
    # from whichever candidate happened to be evaluated last in the first
    # pass, which wasted that pass and, when that candidate was the best
    # move, left the weights stuck at zero forever.
    lowestError = rssError(yArr, np.asarray(xMat * ws))

    for i in range(numIt):
        # Diagnostic output kept from the original: current weights.
        print(ws.T)
        for j in range(n):
            for sign in (-1, 1):
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yArr, np.asarray(yTest))
                if rssE < lowestError:
                    lowestError = rssE
                    print(lowestError)
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat