I found this old thread while I was trying to do something very similar.
I used the outline from the comments below (thanks!) to come up with the following code that seems to work OK.
I have made no effort to make it a clean universal function, so I apologize for the overuse of table and field names (and some data conditions) that are unique to the data I'm using.
It might have been better to write this as a function that has a data set passed to it as a parameter, and it returns a dataset with outliers removed; but I did want to keep a record of what points were deleted.
I used Chauvenet's criterion to remove outliers (rather than just a fixed number of std deviations away from the mean). I added a 2 x multiplier in there to take into account that I'm looking at the furthest point from the mean each time, and that this could be to the left or the right of the mean - I think I've interpreted Chauvenet's criterion correctly, but I wouldn't guarantee it.
And, thanks to the author of the NormsDist function.
CODE
Function removeOutliers() As Boolean
Dim db As Database
Set db = CurrentDb()
Dim qtemp, q2temp, q3temp As QueryDef
Dim rtemp, r2temp, r3temp As Recordset
Dim sql As String
Dim p As Double
'empty the previous costs table
sql = "DELETE * FROM tblCOGSoutlier"
db.Execute (sql)
'put data into the table where qty>0, costs >0, inv date is after Jun 2011 (prior data suspect)
sql = "INSERT INTO tblCOGSoutlier ( PartNum, InvDate, CostEa, fromQty, FlaggedOutlier )"
sql = sql & " SELECT tblCOGSMkTbl.[Part Num], CDate([Inv Date]) AS InvDate, [total cost extended]/[qty shipped] AS CostEa,"
sql = sql & " tblCOGSMkTbl.[Qty Shipped], False AS Expr1 FROM tblCOGSMkTbl"
sql = sql & " WHERE (((CDate([Inv Date]))>=#Jul/1/2011#) AND (([total cost extended]/[qty shipped])>0) AND ((tblCOGSMkTbl.[Qty Shipped])>0));"
db.Execute (sql)
'inflate for multiples?
'not sure about this.... will make for clustered and non normal distribution, but high qtys are more important than low qtys....
'after some thought, strategy is remove outliers before considering quantities per data point
'for each part number
sql = "SELECT tblCOGSoutlier.PartNum FROM tblCOGSoutlier GROUP BY tblCOGSoutlier.PartNum;"
Set qtemp = db.CreateQueryDef("", sql)
Set rtemp = qtemp.OpenRecordset(DB_OPEN_SNAPSHOT)
While Not rtemp.EOF
'get the mean and stddev for the costs not flagged as outliers
sql = "SELECT tblCOGSoutlier.PartNum, StDev(tblCOGSoutlier.[CostEa]) AS StDevOfCostEa, "
sql = sql & "Avg(tblCOGSoutlier.CostEa) AS AvgOfCostEa, Count(CostEa) AS CountOfCost FROM tblCOGSoutlier "
sql = sql & "where (((tblCOGSoutlier.PartNum) Like """ & rtemp!PartNum & """) And "
sql = sql & "((tblCOGSoutlier.FlaggedOutlier) = No)) GROUP BY tblCOGSoutlier.PartNum;"
Set q2temp = db.CreateQueryDef("", sql)
Set r2temp = q2temp.OpenRecordset(DB_OPEN_SNAPSHOT)
'get the cost record for this item where the point is furthest from the mean
'select top 1 ordered by abs(cost-mean)
sql = "SELECT TOP 1 tblCOGSoutlier.LineID, tblCOGSoutlier.FlaggedOutlier, tblCOGSoutlier.PartNum, "
sql = sql & " tblCOGSoutlier.CostEa, Abs([costEa]-" & r2temp!AvgOfCostEa & ") AS Away"
sql = sql & " FROM tblCOGSoutlier where (((tblCOGSoutlier.PartNum) Like """ & rtemp!PartNum & """) And "
sql = sql & "((tblCOGSoutlier.FlaggedOutlier) = No)) ORDER BY Abs([costEa]-" & r2temp!AvgOfCostEa & ") DESC;"
Set q3temp = db.CreateQueryDef("", sql)
Set r3temp = q3temp.OpenRecordset(dbOpenDynaset, dbInconsistent, dbOptimistic)
'while the point is too far away....
'MsgBox (r2temp!PartNum & Chr$(13) & "Away: " & r3temp!away & Chr$(13) & "Ave: " & r2temp!AvgOfCostEa & Chr$(13) & "SDev: " & r2temp!StDevOfCostEa)
If Not (IsNull(r2temp!StDevOfCostEa)) Then
If (r2temp!StDevOfCostEa > 0) Then
'using Chauvenet's criterion.....
p = 2 * (1 - SNorm2(r3temp!away / r2temp!StDevOfCostEa)) * r2temp!CountOfCOst
'MsgBox (r2temp!PartNum & Chr$(13) & "Away: " & r3temp!away & Chr$(13) & "Ave: " & r2temp!AvgOfCostEa & Chr$(13) & "SDev: " & r2temp!StDevOfCostEa & Chr$(13) & "Count: " & r2temp!CountOfCOst & Chr$(13) & "P: " & p)
While p < 0.5
'flag the point
r3temp.MoveFirst
r3temp.Edit
r3temp!FlaggedOutlier = True
r3temp.Update
'get the records that drop this point
sql = "SELECT TOP 1 tblCOGSoutlier.LineID, tblCOGSoutlier.FlaggedOutlier, tblCOGSoutlier.PartNum, "
sql = sql & " tblCOGSoutlier.CostEa, Abs([costEa]-" & r2temp!AvgOfCostEa & ") AS Away"
sql = sql & " FROM tblCOGSoutlier where (((tblCOGSoutlier.PartNum) Like """ & rtemp!PartNum & """) And "
sql = sql & "((tblCOGSoutlier.FlaggedOutlier) = No)) ORDER BY Abs([costEa]-" & r2temp!AvgOfCostEa & ") DESC;"
Set q3temp = db.CreateQueryDef("", sql)
Set r3temp = q3temp.OpenRecordset(dbOpenDynaset, dbInconsistent, dbOptimistic)
'recalculate p
p = 2 * (1 - SNorm2(r3temp!away / r2temp!StDevOfCostEa)) * r2temp!CountOfCOst
Wend
End If
End If
'to get here, only non outliers are left....
'get the next part number
rtemp.MoveNext
Wend
r3temp.Close
q3temp.Close
r2temp.Close
q2temp.Close
rtemp.Close
qtemp.Close
removeOutliers = True
End Function
'***********************************************************************
'* Cumulative Standard Normal Distribution *
'* (this function provides similar result as NORMSDIST( ) on Excel) *
'* Source: http://www.geocities.com/WallStreet/9245/vba6.htm *
'***********************************************************************
Public Function SNorm2(z As Double) As Double
Const c1 = 2.506628
Const c2 = 0.3193815
Const c3 = -0.3565638
Const c4 = 1.7814779
Const c5 = -1.821256
Const c6 = 1.3302744
Dim w As Double, x As Double, y As Double
If z > 0 Or z = 0 Then
w = 1
Else
w = -1
End If
y = 1 / (1 + 0.231649 * w * z)
x = c6
x = y * x + c5
x = y * x + c4
x = y * x + c3
x = y * x + c2
SNorm2 = 0.5 + w * (0.5 - (Exp(-z * z / 2) / c1) * y * x)
End Function