# Victim death dates, from Wikipedia, http://en.wikipedia.org/wiki/Andrei_Chikatilo # Note: Actual dates are unknown for several victims # no. 10 is "After 18 June 1983" (recorded here as 18 June 1983) # nos. 11 and 12 are known only as "July 1983" (used 1 and 15 July) # no. 14 is "After 19 September 1983" (used 19 September) # no. 15 is "summer 1983", used 15 August # Victims nos. 21 and 22 were killed togehter on the same day; since # Simkin and Roychodwdhury only analyze days _between_ murders, I comment # out the 2nd entry # no. 26 is "July 1984", used 26 July # no. 29 is "8--11 August 1984", used 10 August # no. 38 is "1--4 April 1988", used 2 April # A little tinkering shows not much sensitivity in to these guesses murder.dates <- c("22 December 1978", "03 September 1981", "12 June 1982", "25 July 1982", "13 August 1982", "16 August 1982", "08 September 1982", "15 September 1982", "11 December 1982", "18 June 1983", "01 July 1983", "15 July 1983", "09 August 1983", "15 August 1983", "19 September 1983", "27 October 1983", "27 December 1983", "09 January 1984", "21 February 1984", "24 March 1984", "25 May 1984", # "25 May 1984", "22 June 1984", "10 July 1984", "19 July 1984", "26 July 1984", "02 August 1984", "07 August 1984", "10 August 1984", "13 August 1984", "28 August 1984", "06 September 1984", "31 July 1985", "27 August 1985", "16 May 1987", "29 July 1987", "15 September 1987", "02 April 1988", "15 May 1988", "14 July 1988", "08 March 1989", "11 May 1989", "20 June 1989", "19 August 1989", "28 August 1989", "14 January 1990", "07 March 1990", "04 April 1990", "28 July 1990", "14 August 1990", "17 October 1990", "30 October 1990", "06 November 1990") # Convert calendar dates to days since the beginning of the system epoch murder.epoch.days <- as.Date(murder.dates,format="%d %B %Y") # Take first differences to get days between murders murders <- diff(as.numeric(murder.epoch.days)) # Source the files for power-law display and estimation # Readers can get these from http://tuvalu.santafe.edu/~aaronc/powerlaws/ source("~/things-to-work-on/pli-R/pareto.R") source("~/things-to-work-on/pli-R/lnorm.R") source("~/things-to-work-on/pli-R/power-law-test.R") # Plot the cumulative distribution function of the data plot.eucdf.loglog(murders,xlab="Days between muders", ylab="Cumulative probability") # Fit power law and log-normal distributions # Ignores the fact that continuous times are rounded to days; does not # seem to make much difference when experimented with murders.pareto <- pareto.fit(murders,threshold=3) murders.lnorm <- lnorm.fit(murders,threshold=3) # Plot the fitted power law in red curve(ppareto(x,exponent=murders.pareto$exponent,threshold=3,lower.tail=FALSE), add=TRUE,col="red") # Estimated exponent is 1.42, agreeing very well with the 1.4 reported by # Simkin and Roychowdhury, so my assignment of specific values to uncertain # dates can't be too far off from theirs # Plot the fitted log normal curve(plnorm(x,meanlog=murders.lnorm$meanlog,sdlog=murders.lnorm$sdlog, lower.tail=FALSE)/plnorm(3,meanlog=murders.lnorm$meanlog, sdlog=murders.lnorm$sdlog,lower.tail=FALSE),add=TRUE,col="blue") # Do the hypothesis test hyp.test <- vuong(pareto.lnorm.llr(murders,murders.pareto,murders.lnorm)) # goodness-of-fit test: # we have a fixed threshold (==3), so the regular code for the GoF test is # inappropriate # MEMO TO SELF: Modify released code so it handles a fixed threshold! ks.pareto.fixed.threshold <- function(x,threshold) { return(pareto.fit(x,threshold=threshold)$ks.dist) } ks.pvalue.pareto.fixed.threshold <- function(x,threshold,m) { fit <- pareto.fit(x,threshold=threshold) n <- fit$samples.over.threshold exponent <- fit$exponent ks <- fit$ks.dist d.ks <- replicate(m, ks.pareto.fixed.threshold(rpareto(n,threshold=threshold,exponent=exponent), threshold=threshold)) p.value <- sum(d.ks >= ks)/m return(p.value) } p <- ks.pvalue.pareto.fixed.threshold(murders,3,1e5) # approx. 3e-4 when I run it. ### Addendum, 21 January 2012: bootstrap confidence interval for exponent require(boot) murders.mle.boot <- boot(murders, statistic=function(data,original){ pareto.fit(data[original],threshold=3)$exponent }, R=1000) # i.e. 1000 replicates # 95% CI, using the "basic" or "pivotal" method 2*murders.pareto$exponent - quantile(murders.mle.boot$t,c(0.025,0.975)) # I get a CI of (1.35,1.48)