1. Manual cross-validation
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold
import numpy as np

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()

# Generate cross validation folds for the titanic dataset. It returns the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold.
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
0.783389450056
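Note that the sklearn.cross_validation module used here was deprecated in scikit-learn 0.18 and removed in 0.20. If you are on a newer release, the fold generation above would look roughly like the following sketch (same titanic dataframe and alg, only the KFold API changes):

# Rough modern equivalent of the fold generation above (scikit-learn >= 0.18).
# KFold no longer takes the number of rows up front; split() is called on the data instead.
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)  # shuffle is off by default, matching the old behaviour
for train, test in kf.split(titanic[predictors]):
    # train and test are arrays of row indices, exactly as in the loop above.
    alg.fit(titanic[predictors].iloc[train, :], titanic["Survived"].iloc[train])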
2. Cross-validation with sklearn's cross_validation helper
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!).
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold).
print(scores.mean())
0.787878787879
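The same deprecation caveat applies here: on scikit-learn 0.18 and later, cross_val_score lives in model_selection. A minimal sketch of the equivalent call:

# Modern equivalent of the call above (scikit-learn >= 0.18).
from sklearn.model_selection import cross_val_score

scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())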
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!).
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

# Take the mean of the scores (because we have one for each fold).
print(scores.mean())
0.785634118967
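With min_samples_split=2 and min_samples_leaf=1 each of the ten trees is grown out fully, which tends to overfit the training folds. A common next step is to add trees and raise those limits; the values below are illustrative examples to experiment with, not tuned results:

# Illustrative re-parameterisation of the random forest above; the numbers are
# examples to try, not values from the original walkthrough.
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
print(scores.mean())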
3. Voting classification with multiple models

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset. We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)

# The gradient boosting classifier generates better predictions, so we weight it higher.
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
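These are raw survival probabilities for the test set. To turn them into a Kaggle submission you would threshold them and pair them with the passenger ids; a minimal sketch, assuming titanic_test still carries the original PassengerId column from test.csv (the output filename is arbitrary):

# Threshold the blended probabilities and write a submission file.
# Assumes titanic_test has a PassengerId column (as in the Kaggle test.csv).
import pandas as pd

predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions.astype(int)
})
submission.to_csv("kaggle_submission.csv", index=False)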
Ensembling the models inside cross-validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
import numpy as np

# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross validation folds.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold.
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is taken as a 1 prediction, and .5 or below as a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)

# Compute accuracy by comparing to the training data.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
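For reference, scikit-learn (0.17 and later) ships a VotingClassifier that can express this kind of soft-voting ensemble in a few lines, with the restriction that every model sees the same predictor columns. A sketch under that assumption (the shared column list and the equal weights are illustrative, not the blend used above):

# Sketch of the same soft-voting idea using sklearn's VotingClassifier.
# Both estimators receive the same columns here, unlike the hand-rolled loop above.
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

voting_predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

voter = VotingClassifier(
    estimators=[
        ("gbc", GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)),
        ("lr", LogisticRegression(random_state=1)),
    ],
    voting="soft",   # average predicted probabilities, then threshold at .5
    weights=[1, 1],  # equal weights mirror the simple average used above
)

scores = cross_validation.cross_val_score(voter, titanic[voting_predictors], titanic["Survived"], cv=3)
print(scores.mean())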