2015-06-29 42 views
3

numpy.linalg.lstsq(a,b)函數接受大小爲nx2的數組a和作爲因變量的1維數組b2d陣列上的最小二乘迴歸

我該如何去做一個最小二乘迴歸,其中的數據點顯示爲從圖像文件生成的二維數組?該陣列看起來是這樣的:

[[0, 0, 0, 0, e] 
[0, 0, c, d, 0] 
[b, a, f, 0, 0]] 

其中a, b, c, d, e, f是正整數。

我想適合這些要點。我可以使用np.linalg.lstsq(如果是這樣,如何),還是有什麼可能更有意義(如果是,如何)?

非常感謝。

+0

您可以發佈您2D陣列和你的線性模型? – Dalek

回答

1

曾經有一段時間我看到了類似的Python程序從

# Prac 2 for Monte Carlo methods in a nutshell 
# Richard Chopping, ANU RSES and Geoscience Australia, October 2012 
# Useage 
# python prac_q2.py [number of bootstrap runs] 
# e.g. python prac_q2.py 10000 
# would execute this and perform 10 000 bootstrap runs. 
# Default is 100 runs. 

# sys cause I need to access the arguments the script was called with 
import sys 
# math cause it's handy for scalar maths 
import math 
# time cause I want to benchmark how long things take 
import time 
# numpy cause it gives us awesome array/matrix manipulation stuff 
import numpy 
# scipy just in case 
import scipy 
# scipy.stats to make life simpler statistcally speaking 
import scipy.stats as stats 

def main(): 
    print "Prac 2 solution: no graphs" 
    true_model = numpy.array([17.0, 10.0, 1.96]) 

    # Here's a nifty way to write out numpy arrays. 
    # Unlike the data table in the prac handouts, I've got time first 
    # and height second. 
    # You can mix up the order but you need to change a lot of calculations 
    # to deal with this change. 
    data = numpy.array([[1.0, 26.94], 
         [2.0, 33.45], 
         [3.0, 40.72], 
         [4.0, 42.32], 
         [5.0, 44.30], 
         [6.0, 47.19], 
         [7.0, 43.33], 
         [8.0, 40.13]]) 
    # Perform the least squares regression to find the best fit solution 
    best_fit = regression(data) 
    # Nifty way to get out elements from an array 
    m1,m2,m3 = best_fit 
    print "Best fit solution:" 
    print "m1 is", m1, "and m2 is", m2, "and m3 is", m3 

    # Calculate residuals from the best fit solution 
    best_fit_resid = residuals(data, best_fit) 

    print "The residuals from the best fit solution are:" 
    print best_fit_resid 
    print "" 

    # Bootstrap part 
    # -------------- 
    # Number of bootstraps to run. 100 is a minimum and our default number. 
    num_booties = 100 
    # If we have an argument to the python script, use this as the 
    # number of bootstrap runs 
    if len(sys.argv) > 1: 
     num_booties = int(sys.argv[1]) 

    # preallocate an array to store the results. 
    ensemble = numpy.zeros((num_booties, 3)) 

    print "Starting up the bootstrap routine" 

    # How to do timing within a Python script - here I start a stopwatch running 
    start_time = time.clock() 
    for index in range(num_booties): 
     # Print every 10 % so we know where we're up to in long runs 
     if print_progress(index, num_booties): 
      percent = (float(index)/float(num_booties)) * 100.0 
      print "Have completed", percent, "percent" 

     # For each iteration of the bootstrap algorithm, 
     # first calculate mixed up residuals... 
     resamp_resid = resamp_with_replace(best_fit_resid) 
     # ... then generate new data... 
     new_data = calc_new_data(data, best_fit, resamp_resid) 
     # ... then perform another regression to generate a new set of m1, m2, m3 
     bootstrap_model = regression(new_data) 
     ensemble[index] = (bootstrap_model[0], bootstrap_model[1], bootstrap_model[2]) 
     # Done with the loop 
    # Calculate the time the run took - what's the current time, minus when we started. 
    loop_time = time.clock() - start_time 

    print "" 

    print "Ensemble calculated based on", num_booties, "bootstrap runs." 
    print "Bootstrap runs took", loop_time, "seconds." 
    print "" 

    # Stats on the ensemble time 
    # -------------------------- 
    B = num_booties 

    # Mean is pretty simple, 1.0/B to force it to use floating points 
    # This gives us an array of the means of the 3 model parameters 
    mean = 1.0/B * numpy.sum(ensemble, axis=0) 
    print "Mean is ([m1 m2 m3]):", mean 

    # Variance 
    var2 = 1.0/B * numpy.sum(((ensemble - mean)**2), axis=0) 
    print "Variance squared is ([m1 m2 m3]):", var2 
    # Bias 
    bias = mean - best_fit 
    print "Bias is ([m1 m2 m3]):", bias 
    bias_corr = best_fit - bias 
    print "Bias corrected solution is ([m1 m2 m3]):", bias_corr 
    print "The original solution was ([m1 m2 m3]):", best_fit 
    print "And the true solution is ([m1 m2 m3]):", true_model 

    print "" 

    # Confidence intervals 
    # --------------------- 
    # Sort column 1 to calculate confidence intervals 
    # Sorting in numpy sucks. 
    # Need to declare what the fields are (so it knows how to sort it) 
    # f8 => numpy's floating point number 
    # Then need to delcare what we sort it on 
    # Here we sort on the first column, then the second, then the third. 
    # f0,f1,f2 field 0, then field 1, then field 2. 
    # Then we make sure we sort it by column (axis = 0) 
    # Then we take a view of that data as a float64 so it works properly 
    sorted_m1 = numpy.sort(ensemble.view('f8,f8,f8'), order=['f0','f1','f2'], axis=0).view(numpy.float64) 

    # stats is my name for scipy.stats 
    # This has a wonderful function that calculates percentiles, including performing interpolation 
    # (important for low numbers of bootstrap runs) 
    m1_perc0p5 = stats.scoreatpercentile(sorted_m1,0.5)[0] 
    m1_perc2p5 = stats.scoreatpercentile(sorted_m1,2.5)[0] 
    m1_perc16 = stats.scoreatpercentile(sorted_m1,16)[0] 
    m1_perc84 = stats.scoreatpercentile(sorted_m1,84)[0] 
    m1_perc97p5 = stats.scoreatpercentile(sorted_m1,97.5)[0] 
    m1_perc99p5 = stats.scoreatpercentile(sorted_m1,99.5)[0] 
    print "m1 68% confidence interval is from", m1_perc16, "to", m1_perc84 
    print "m1 95% confidence interval is from", m1_perc2p5, "to", m1_perc97p5 
    print "m1 99% confidence interval is from", m1_perc0p5, "to", m1_perc99p5 
    print "" 

    # Now column 2, sort it... 
    sorted_m2 = numpy.sort(ensemble.view('f8,f8,f8'), order=['f1','f0','f2'], axis=0).view(numpy.float64) 
    # ... and do stats. 
    m2_perc0p5 = stats.scoreatpercentile(sorted_m2,0.5)[1] 
    m2_perc2p5 = stats.scoreatpercentile(sorted_m2,2.5)[1] 
    m2_perc16 = stats.scoreatpercentile(sorted_m2,16)[1] 
    m2_perc84 = stats.scoreatpercentile(sorted_m2,84)[1] 
    m2_perc97p5 = stats.scoreatpercentile(sorted_m2,97.5)[1] 
    m2_perc99p5 = stats.scoreatpercentile(sorted_m2,99.5)[1] 
    print "m2 68% confidence interval is from", m2_perc16, "to", m2_perc84 
    print "m2 95% confidence interval is from", m2_perc2p5, "to", m2_perc97p5 
    print "m2 99% confidence interval is from", m2_perc0p5, "to", m2_perc99p5 
    print "" 

    # and finally column 3, again, sort it.. 
    sorted_m3 = numpy.sort(ensemble.view('f8,f8,f8'), order=['f2','f1','f0'], axis=0).view(numpy.float64) 
    # ... and do stats. 
    m3_perc0p5 = stats.scoreatpercentile(sorted_m3,0.5)[1] 
    m3_perc2p5 = stats.scoreatpercentile(sorted_m3,2.5)[1] 
    m3_perc16 = stats.scoreatpercentile(sorted_m3,16)[1] 
    m3_perc84 = stats.scoreatpercentile(sorted_m3,84)[1] 
    m3_perc97p5 = stats.scoreatpercentile(sorted_m3,97.5)[1] 
    m3_perc99p5 = stats.scoreatpercentile(sorted_m3,99.5)[1] 
    print "m3 68% confidence interval is from", m3_perc16, "to", m3_perc84 
    print "m3 95% confidence interval is from", m3_perc2p5, "to", m3_perc97p5 
    print "m3 99% confidence interval is from", m3_perc0p5, "to", m3_perc99p5 
    print "" 
    # End of the main function 


# 
# 
# Helper functions go down here 
# 
# 


# regression 
# This takes a 2D numpy array and performs a least-squares regression 
# using the formula on the practical sheet, page 3 
# Stored in the top are the real values 
# Returns an array of m1, m2 and m3. 
def regression(data): 
    # While testing, just return the real values 
    # real_values = numpy.array([17.0, 10.0, 1.96]) 

    # Creating the G matrix 
    # --------------------- 
    # Because I'm using numpy arrays here, we need 
    # to learn some notation. 
    # data[:,0] is the FIRST column 
    # Length of this = number of time samples in data 
    N = len(data[:,0]) 

    # numpy.sum adds up all data in a row or column. 
    # Axis = 0 implies add up each column. [0] at end 
    # returns the sum of the first column 
    # This is the sum of Ti for i = 1..N 
    sum_Ti = numpy.sum(data, axis=0)[0] 

    # numpy.power takes each element of an array and raises them to a given power 
    # In this one call we also take the sum of the columns (as above) after they have 
    # been squared, and then just take the t column 
    sum_Ti2 = numpy.sum(numpy.power(data, 2), axis=0)[0] 

    # Now we need to get the cube of Ti, then sum that result 
    sum_Ti3 = numpy.sum(numpy.power(data, 3), axis=0)[0] 

    # Finally we need the quartic of Ti, then sum that result 
    sum_Ti4 = numpy.sum(numpy.power(data, 4), axis=0)[0] 

    # Now we can construct the G matrix 
    G = numpy.array([[N, sum_Ti, -0.5 * sum_Ti2], 
         [sum_Ti, sum_Ti2, -0.5 * sum_Ti3], 
         [-0.5 * sum_Ti2, -0.5 * sum_Ti3, 0.25 * sum_Ti4]]) 
    # We also need to take the inverse of the G matrix 
    G_inv = numpy.linalg.inv(G) 


    # Creating the d matrix 
    # --------------------- 
    # Hello numpy.sum, my old friend... 
    sum_Yi = numpy.sum(data, axis=0)[1] 

    # numpy.prod multiplies the values in an array. 
    # We need to do the products along axis 1 (i.e. row by row) 
    # Then sum all the elements 
    sum_TiYi = numpy.sum(numpy.prod(data, axis=1)) 

    # The final element we need is a bit tricky. 
    # We need the product as above 
    TiYi = numpy.prod(data, axis=1) 
    # Then we get tricky. * works how we need it here, 
    # remember that the Ti column is referenced by data[:,0] as above 
    Ti2Yi = TiYi * data[:,0] 
    # Then we sum 
    sum_Ti2Yi = numpy.sum(Ti2Yi) 

    #With all the elements, we make the d matrix 
    d = numpy.array([sum_Yi, 
        sum_TiYi, 
        -0.5 * sum_Ti2Yi]) 

    # Do the linear algebra stuff 
    # To multiple numpy arrays in a matrix style, 
    # we need to use numpy.dot() 
    # Not the most useful notation, but there you go. 
    # To help out the Matlab users: http://www.scipy.org/NumPy_for_Matlab_Users 
    result = G_inv.dot(d) 

    #Return this result 
    return result 

# residuals: 
# Takes in a data array, and an array of best fit paramers 
# calculates the difference between the observed and predicted data 
# and returns an array 
def residuals(data, best_fit): 
    # Extract ti from the data array 
    ti = data[:,0] 
    # We also need an array of the square of ti 
    ti2 = numpy.power(ti, 2) 

    # Extract yi 
    yi = data[:,1] 

    # Calculate residual (data minus predicted) 
    result = yi - best_fit[0] - (best_fit[1] * ti) + (0.5 * best_fit[2] * ti2) 

    return result 

# resamp_with_replace: 
# Perform a dataset resampling with replacement on parameter set. 
# Uses numpy.random to generate the random numbers to pick the indices to look up. 
# So for item 0, ... N, we look up a random index from the set and put that in 
# our resampled data. 
def resamp_with_replace(set): 
    # How many things do we need to do this for? 
    N = len(set) 

    # Preallocate our result array 
    result = numpy.zeros(N) 

    # Generate N random integers between 0 and N-1 
    indices = numpy.random.randint(0, N - 1, N) 

    # For i from the set 0...N-1 (that's what the range() command gives us), 
    # our result for that i is given by the index we randomly generated above 
    for i in range(N): 
     result[i] = set[indices[i]] 

    return result 

# calc_new_data: 
# Given a set of resampled residuals, use the model parameters to derive 
# new data. This is used for bootstrapping the residuals. 
# true_data is a numpy array of rows of ti, yi. We only need the ti column though. 
# model is an array of three parameters, corresponding to m1, m2, m3. 
# residuals are an array of our resudials 
def calc_new_data(true_data, model, residuals): 
    # Extract the time information from the new data array 
    ti = true_data[:,0] 

    # Calculate new data using array maths 
    # This goes through and does the sums etc for each element of the array 
    # Nice and compact way to represent it. 
    y_new = residuals + model[0] + (model[1] * ti) - (0.5 * model[2] * ti**2) 

    # Our result needs to be an array of ti, y_new, so we need to combine them using 
    # the numpy.column_stack routine 
    result = numpy.column_stack((ti, y_new)) 

    # Return this combined array 
    return result 

# print_progress: 
# Just a quick thing that returns true if we want to print for this index 
# and false otherwise 
def print_progress(index, total): 
    index = float(index) 
    total = float(total) 

    result = False 

    # Floating point maths is irritating 
    # We want to print at the start, every 10%, and at the end. 
    # This works up to index = 100,000 
    # Would also be lovely if Python had a switch statement 
    if (((index/total) * 100) <= 0.00001): 
     result = True 
    elif (((index/total) * 100) >= 9.99999) and (((index/total) * 100) <= 10.00001): 
     result = True 
    elif (((index/total) * 100) >= 19.99999) and (((index/total) * 100) <= 20.00001): 
     result = True 
    elif (((index/total) * 100) >= 29.99999) and (((index/total) * 100) <= 30.00001): 
     result = True 
    elif (((index/total) * 100) >= 39.99999) and (((index/total) * 100) <= 40.00001): 
     result = True 
    elif (((index/total) * 100) >= 49.99999) and (((index/total) * 100) <= 50.00001): 
     result = True 
    elif (((index/total) * 100) >= 59.99999) and (((index/total) * 100) <= 60.00001): 
     result = True 
    elif (((index/total) * 100) >= 69.99999) and (((index/total) * 100) <= 70.00001): 
     result = True 
    elif (((index/total) * 100) >= 79.99999) and (((index/total) * 100) <= 80.00001): 
     result = True 
    elif (((index/total) * 100) >= 89.99999) and (((index/total) * 100) <= 90.00001): 
     result = True 
    elif ((((index+1)/total) * 100) > 99.99999): 
     result = True 
    else: 
     result = False 

    return result 

# 
# 
# End of helper functions 
# 
# 

# So we can easily execute our script 
if __name__ == "__main__": 
    main() 

我想你可以看看,這裏是link完成信息

2

使用sklearn而不是numpy的(sklearn從numpy的派生但是對於許多這種計算)更好:

from sklearn import linear_model 
clf = linear_model.LinearRegression() 
clf.fit ([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) 

線性迴歸(copy_X = T芸香,fit_intercept =真,n_jobs = 1, 正常化=假)

clf.coef_ 

陣列([0.5,0.5])