Cluster und Korrelationen

From DHVLab

Das vorliegende, kommentierte Skript dient dazu, über das DHVLab mit R und dem Datensatz Artworks des Museum of Modern Art Analysen zum Themenkreis Cluster und Korrelationen durchzuführen.

In dieser Einheit verwendete Funktionen: read.csv, gsub, as.numeric, plot, abline, points, hist, boxplot, shapiro.test, cor, quantile, IQR.

Korrelationen

# Daten einlesen
url <- "https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv"
artworks <- read.csv(url)

# Übersicht über die Daten
colnames(artworks)
table(as.factor(artworks$Classification))

# Daten selektieren
paintings <- artworks[artworks$Classification == "Painting",]
paintings <- paintings[,c("Title", "Artist", "Date", "DateAcquired", "Height..cm.", "Width..cm.")]

#------------------------------------------------

# Zusammenhang Entstehungsdatum und Ankaufdatum
paintings$Date <- gsub(".*?([1-2][0-9]{3}).*", "\\1", paintings$Date)
paintings$Date <- as.numeric(paintings$Date)

paintings$DateAcquired <- gsub(".*?([1-2][0-9]{3}).*", "\\1", paintings$DateAcquired)
paintings$DateAcquired <- as.numeric(paintings$DateAcquired)

paintingsDate <- paintings[!is.na(paintings$Date) & !is.na(paintings$DateAcquired),]

# Streudiagramm
plot(paintingsDate$Date, paintingsDate$DateAcquired, main = "Zusammenhang Entstehungsdatum und Ankaufdatum", xlab = "Entstehungsdatum", ylab = "Ankaufdatum")

# Verteilungen der Variablen
hist(paintingsDate$Date)
boxplot(paintingsDate$Date)
shapiro.test(paintingsDate$Date)

hist(paintingsDate$DateAcquired)
boxplot(paintingsDate$DateAcquired)
shapiro.test(paintingsDate$DateAcquired)

# Korrelation mit Kendalls Tau (Moderater Zusammenhang), da metrische, nicht normalverteilte Variablen
cor(paintingsDate$Date, paintingsDate$DateAcquired, method = "kendall")

#------------------------------------------------

# Zusammenhang Höhe und Breite
paintingsDim <- paintings[!is.na(paintings$Height..cm.) & !is.na(paintings$Width..cm.),]

# Streudiagramm
plot(paintingsDim$Width..cm., paintingsDim$Height..cm., main = "Zusammenhang Breite und Höhe", xlab = "Breite", ylab = "Höhe")
abline(lm(paintingsDim$Height..cm. ~ paintingsDim$Width..cm.), col = "red")

# Ausreißer kennzeichnen
qWidth <- quantile(paintingsDim$Width..cm., 0.75)
iqrWidth <- IQR(paintingsDim$Width..cm.)*1.5
qHeight <- quantile(paintingsDim$Height..cm., 0.75)
iqrHeight <- IQR(paintingsDim$Height..cm.)*1.5

outliers <- paintingsDim[paintingsDim$Width..cm. > (qWidth + iqrWidth) | paintingsDim$Height..cm. > (qHeight + iqrHeight),]
points(outliers$Width..cm., outliers$Height..cm., col = "green")

# Verteilungen der Variablen
hist(paintingsDim$Width..cm.)
boxplot(paintingsDim$Width..cm.)
shapiro.test(paintingsDim$Width..cm.)

hist(paintingsDim$Height..cm.)
boxplot(paintingsDim$Height..cm.)
shapiro.test(paintingsDim$Height..cm.)

# Korrelation mit Kendalls Tau (Starker Zusammenhang), da metrische, nicht normalverteilte Variablen
cor(paintingsDim$Width..cm., paintingsDim$Height..cm., method = "kendall")

Cluster

Coming Soon.