Cluster und Korrelationen
From DHVLab
Das vorliegende, kommentierte Skript dient dazu, über das DHVLab mit R und dem Datensatz Artworks des Museum of Modern Art Analysen zum Themenkreis Cluster und Korrelationen durchzuführen.
In dieser Einheit verwendete Funktionen: read.csv
, gsub
, as.numeric
, plot
, abline
, points
, hist
, boxplot
, shapiro.test
, cor
, quantile
, IQR
.
Korrelationen
# Daten einlesen
url <- "https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv"
artworks <- read.csv(url)
# Übersicht über die Daten
colnames(artworks)
table(as.factor(artworks$Classification))
# Daten selektieren
paintings <- artworks[artworks$Classification == "Painting",]
paintings <- paintings[,c("Title", "Artist", "Date", "DateAcquired", "Height..cm.", "Width..cm.")]
#------------------------------------------------
# Zusammenhang Entstehungsdatum und Ankaufdatum
paintings$Date <- gsub(".*?([1-2][0-9]{3}).*", "\\1", paintings$Date)
paintings$Date <- as.numeric(paintings$Date)
paintings$DateAcquired <- gsub(".*?([1-2][0-9]{3}).*", "\\1", paintings$DateAcquired)
paintings$DateAcquired <- as.numeric(paintings$DateAcquired)
paintingsDate <- paintings[!is.na(paintings$Date) & !is.na(paintings$DateAcquired),]
# Streudiagramm
plot(paintingsDate$Date, paintingsDate$DateAcquired, main = "Zusammenhang Entstehungsdatum und Ankaufdatum", xlab = "Entstehungsdatum", ylab = "Ankaufdatum")
# Verteilungen der Variablen
hist(paintingsDate$Date)
boxplot(paintingsDate$Date)
shapiro.test(paintingsDate$Date)
hist(paintingsDate$DateAcquired)
boxplot(paintingsDate$DateAcquired)
shapiro.test(paintingsDate$DateAcquired)
# Korrelation mit Kendalls Tau (Moderater Zusammenhang), da metrische, nicht normalverteilte Variablen
cor(paintingsDate$Date, paintingsDate$DateAcquired, method = "kendall")
#------------------------------------------------
# Zusammenhang Höhe und Breite
paintingsDim <- paintings[!is.na(paintings$Height..cm.) & !is.na(paintings$Width..cm.),]
# Streudiagramm
plot(paintingsDim$Width..cm., paintingsDim$Height..cm., main = "Zusammenhang Breite und Höhe", xlab = "Breite", ylab = "Höhe")
abline(lm(paintingsDim$Height..cm. ~ paintingsDim$Width..cm.), col = "red")
# Ausreißer kennzeichnen
qWidth <- quantile(paintingsDim$Width..cm., 0.75)
iqrWidth <- IQR(paintingsDim$Width..cm.)*1.5
qHeight <- quantile(paintingsDim$Height..cm., 0.75)
iqrHeight <- IQR(paintingsDim$Height..cm.)*1.5
outliers <- paintingsDim[paintingsDim$Width..cm. > (qWidth + iqrWidth) | paintingsDim$Height..cm. > (qHeight + iqrHeight),]
points(outliers$Width..cm., outliers$Height..cm., col = "green")
# Verteilungen der Variablen
hist(paintingsDim$Width..cm.)
boxplot(paintingsDim$Width..cm.)
shapiro.test(paintingsDim$Width..cm.)
hist(paintingsDim$Height..cm.)
boxplot(paintingsDim$Height..cm.)
shapiro.test(paintingsDim$Height..cm.)
# Korrelation mit Kendalls Tau (Starker Zusammenhang), da metrische, nicht normalverteilte Variablen
cor(paintingsDim$Width..cm., paintingsDim$Height..cm., method = "kendall")
Cluster
Coming Soon.