G4: Edinaldo de Alencar / Igor Freire / Ramon Araújo / Ricardo Ribeiro
26 de setembro de 2014
# Mudar para o diretório onde está a base de dados
setwd(paste("/Users/igorfreire/Documents/Mestrado/UFPA",
"/Mineração de Dados/",
"Trabalho 1 - Pré-processamento NA's", sep=""))
# Ler o .csv
dataset <- read.csv(file = "Sample_Store_Dataset.csv",
header = TRUE,
sep = ";", # caractere separador de entradas no .csv
dec = ",", # caractere separador de casas decimais
na.strings = c("Not Specified", "?", ""))
sep =
)dec =
)na.strings =
).Obs: Na base fornecida os valores ausentes aparecem como "Not Specified"
e "?"
summary(dataset)
## Row.ID Order.Priority Discount Unit.Price
## Min. : 2 Critical :1800 Min. :0.0000 Min. : 1
## 1st Qu.:19330 Critical : 1 1st Qu.:0.0200 1st Qu.: 6
## Median :21686 High :1969 Median :0.0500 Median : 21
## Mean :20241 Low :1926 Mean :0.0496 Mean : 88
## 3rd Qu.:24043 Medium :1844 3rd Qu.:0.0800 3rd Qu.: 86
## Max. :26399 NA's :1886 Max. :0.2500 Max. :6783
##
## Shipping.Cost Customer.ID Customer.Name
## Min. : 0.49 Min. : 2 Louis Parrish : 27
## 1st Qu.: 3.19 1st Qu.: 898 Jenny Gold : 26
## Median : 6.05 Median :1750 Leigh Burnette Hurley: 22
## Mean : 12.80 Mean :1738 Sean N Boyer : 22
## 3rd Qu.: 13.99 3rd Qu.:2579 Andrew Gonzalez : 21
## Max. :164.73 Max. :3403 Glen Caldwell : 21
## (Other) :9287
## Ship.Mode Customer.Segment Product.Category
## Delivery Truck:1283 Consumer :1894 Furniture :1933
## Express Air :1107 Corporate :3375 Office Supplies:5181
## Regular Air :7036 Home Office :2316 Technology :2312
## Small Business:1841
##
##
##
## Product.Sub.Category Product.Container
## Paper :1374 Jumbo Box : 590
## Binders and Binder Accessories:1024 Jumbo Drum: 703
## Telephones and Communication : 976 Large Box : 457
## Office Furnishings : 883 Medium Box: 410
## Computer Peripherals : 836 Small Box :4887
## (Other) :4318 Small Pack:1067
## NA's : 15 Wrap Bag :1312
## Product.Name
## Bevis 36 x 72 Conference Tables : 26
## BoxOffice By Design Rectangular and Half-Moon Meeting Room Tables : 24
## Global High-Back Leather Tilter, Burgundy : 23
## Master Giant Foot<a9> Doorstop, Safety Yellow : 22
## Office Star - Mid Back Dual function Ergonomic High Back Chair with 2-Way Adjustable Arms: 22
## StarTAC 7760 : 22
## (Other) :9287
## Product.Base.Margin Region.ID State.or.Province
## Min. :0.35 Central:2609 California:1021
## 1st Qu.:0.38 East :2030 Texas : 646
## Median :0.52 West :1934 Illinois : 584
## Mean :0.51 South :1830 New York : 574
## 3rd Qu.:0.59 4 : 340 Florida : 522
## Max. :0.85 1 : 308 Ohio : 396
## NA's :72 (Other): 375 (Other) :5683
## City Postal.Code Order.Date Ship.Date
## New York City: 199 Min. : 1001 Min. :40179 Min. :40180
## Los Angeles : 196 1st Qu.:29406 1st Qu.:40609 1st Qu.:40611
## Seattle : 89 Median :52302 Median :41007 Median :41008
## Chicago : 88 Mean :52446 Mean :40974 Mean :40976
## Boston : 78 3rd Qu.:78516 3rd Qu.:41359 3rd Qu.:41361
## Washington : 68 Max. :99362 Max. :41639 Max. :41656
## (Other) :8708
## Profit Quantity.ordered.new Sales Order.ID
## Min. :-16477 Min. : 1.0 Min. : 1 Min. : 6
## 1st Qu.: -74 1st Qu.: 5.0 1st Qu.: 61 1st Qu.:86737
## Median : 3 Median : 10.0 Median : 203 Median :88344
## Mean : 139 Mean : 13.8 Mean : 950 Mean :82318
## 3rd Qu.: 140 3rd Qu.: 17.0 3rd Qu.: 776 3rd Qu.:89988
## Max. : 16332 Max. :170.0 Max. :100119 Max. :91591
##
As linhas a seguir apresentam um “sumário” dos atributos que possuem VA’s e salvam os índices das instâncias contendo VA’s. Estes índices serão usados posteriormente.
Nota: NA’s = VA’s = Valores Ausentes
summary(dataset$Order.Priority)
## Critical Critical High Low Medium NA's
## 1800 1 1969 1926 1844 1886
# Salvar índices das instâncias contendo VA's:
order_priority_na_index <- which(is.na(dataset$Order.Priority))
summary(dataset$Product.Base.Margin)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.35 0.38 0.52 0.51 0.59 0.85 72
# Salvar índices das instâncias contendo VA's:
product_base_margin_na_index <- which(is.na(dataset$Product.Base.Margin))
summary(dataset$Product.Sub.Category)
## Appliances Binders and Binder Accessories
## 494 1024
## Bookcases Chairs & Chairmats
## 207 442
## Computer Peripherals Copiers and Fax
## 836 107
## Envelopes Labels
## 272 334
## Office Furnishings Office Machines
## 883 379
## Paper Pens & Art Supplies
## 1374 710
## Rubber Bands Scissors Rulers and Trimmers
## 195 35
## Scissors, Rulers and Trimmers Storage & Organization
## 130 605
## Tables Telephones and Communication
## 408 976
## NA's
## 15
# Salvar índices das instâncias contendo VA's:
product_sub_category_nas <- which(is.na(dataset$Product.Sub.Category))
Métodos Testados:
library(DMwR)
mmImput_dataset <- centralImputation(dataset)
# NA's "imputados"
table(mmImput_dataset$Order.Priority[order_priority_na_index])
##
## Critical Critical High Low Medium
## 0 0 1886 0 0
# Comparar com a moda:
mode(dataset$Order.Priority)
## [1] High
## Levels: Critical Critical High Low Medium
# Estatisticas originais
summary(mmImput_dataset$Product.Base.Margin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.350 0.380 0.520 0.512 0.590 0.850
# NA's "imputados"
mmImput_dataset$Product.Base.Margin[product_base_margin_na_index]
## [1] 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## [15] 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## [29] 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## [43] 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## [57] 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## [71] 0.52 0.52
# NA's "imputados"
table(mmImput_dataset$Product.Sub.Category[product_sub_category_nas])
##
## Appliances Binders and Binder Accessories
## 0 0
## Bookcases Chairs & Chairmats
## 0 0
## Computer Peripherals Copiers and Fax
## 0 0
## Envelopes Labels
## 0 0
## Office Furnishings Office Machines
## 0 0
## Paper Pens & Art Supplies
## 15 0
## Rubber Bands Scissors Rulers and Trimmers
## 0 0
## Scissors, Rulers and Trimmers Storage & Organization
## 0 0
## Tables Telephones and Communication
## 0 0
# Comparar com a moda:
mode(dataset$Product.Sub.Category)
## [1] Paper
## 18 Levels: Appliances Binders and Binder Accessories ... Telephones and Communication
preprocessed_dataset <- knnImputation(dataset, k = 3)
k
determina o número de vizinhos
# NA's "imputados"
table(preprocessed_dataset$Order.Priority[order_priority_na_index])
##
## Critical Critical High Low Medium
## 895 801 190 0 0
# Comparar com a moda:
mode(dataset$Order.Priority)
## [1] High
## Levels: Critical Critical High Low Medium
# Estatisticas originais
summary(dataset$Product.Base.Margin)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.35 0.38 0.52 0.51 0.59 0.85 72
# NA's "imputados"
preprocessed_dataset$Product.Base.Margin[product_base_margin_na_index]
## [1] 0.4740 0.5398 0.3669 0.4371 0.6138 0.3693 0.7320 0.7048 0.4754 0.4089
## [11] 0.7394 0.5105 0.6456 0.5633 0.3668 0.7400 0.5781 0.6308 0.5759 0.5433
## [21] 0.4329 0.5723 0.6228 0.4265 0.8041 0.5276 0.4915 0.5219 0.6905 0.4303
## [31] 0.3811 0.3698 0.4659 0.4627 0.7094 0.4845 0.3621 0.5377 0.4914 0.6627
## [41] 0.7478 0.7323 0.5738 0.7183 0.3805 0.6143 0.6872 0.6795 0.7117 0.5919
## [51] 0.8365 0.6340 0.7014 0.5723 0.8073 0.5959 0.5851 0.3974 0.3816 0.6553
## [61] 0.4223 0.4153 0.3730 0.4344 0.3798 0.7323 0.5296 0.4164 0.5731 0.4528
## [71] 0.6561 0.4888
# NA's "imputados"
table(preprocessed_dataset$Product.Sub.Category[product_sub_category_nas])
##
## Appliances Binders and Binder Accessories
## 6 5
## Bookcases Chairs & Chairmats
## 4 0
## Computer Peripherals Copiers and Fax
## 0 0
## Envelopes Labels
## 0 0
## Office Furnishings Office Machines
## 0 0
## Paper Pens & Art Supplies
## 0 0
## Rubber Bands Scissors Rulers and Trimmers
## 0 0
## Scissors, Rulers and Trimmers Storage & Organization
## 0 0
## Tables Telephones and Communication
## 0 0
# Comparar com a moda:
mode(dataset$Product.Sub.Category)
## [1] Paper
## 18 Levels: Appliances Binders and Binder Accessories ... Telephones and Communication
Objetivo: Comparar a acurácia de modelos treinados pelo conjunto de dados com VA’s e pelo conjunto de dados pré-processado (cujos VA’s foram tratados).
Estratégia:
Product.Category
) com base nas variáveis “Order.Priority
”, “Unit.Price
”, “Shipping.Cost
”, “Ship.Mode
” e “Product.Base.Margin
”.library(caret)
# Somente alguns atributos (colunas) serão utilizados para o teste:
selectedCols <- c("Product.Category",
"Order.Priority",
"Unit.Price",
"Shipping.Cost",
"Ship.Mode",
"Product.Base.Margin")
# Gerar indices para particionar o conjunto de dados entre "treinamento" e "teste"
inTraining <- createDataPartition(dataset$Product.Category, p = 0.7, list = FALSE)
# Conjuntos de "treinamento" e "teste"
training <- dataset[inTraining, match(selectedCols, colnames(dataset))]
testing <- dataset[-inTraining, match(selectedCols, colnames(dataset))]
# Treinamento e teste para o conjunto pré-processado (conjunto cujos VA's foram tratados)
training_preProcessed <- preprocessed_dataset[inTraining, match(selectedCols, colnames(dataset))]
testing_preProcessed <- preprocessed_dataset[-inTraining, match(selectedCols, colnames(dataset))]
# Modelo para o conjunto de dados original
rf_model <- train(Product.Category ~ ., method = "rf", data = training)
# Modelo para o conjunto de dados pré-processado
rf_model_preProcessed <- train(Product.Category ~ ., method = "rf", data = training_preProcessed)
predictedValues <- predict(rf_model, testing)
predictedValues_preProcessed <- predict(rf_model_preProcessed, testing)
# Matrizes de confusão:
confusionMtx <- confusionMatrix(na.omit(testing)$Product.Category, predictedValues)
confusionMtx_preProcessed <- confusionMatrix(na.omit(testing)$Product.Category, predictedValues_preProcessed)
# Acurácias:
confusionMtx$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9920 0.9867 0.9874 0.9953 0.5483
## AccuracyPValue McnemarPValue
## 0.0000 NaN
confusionMtx_preProcessed$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9969 0.9948 0.9936 0.9988 0.5452
## AccuracyPValue McnemarPValue
## 0.0000 NaN