Das vorliegende, kommentierte Skript dient dazu, über das DHVLab mit R und dem Datensatz Artworks des Museum of Modern Art Analysen zum Themenkreis Cluster und Korrelationen durchzuführen.
In dieser Einheit verwendete Funktionen: read.csv
,
gsub
, as.numeric
, plot
,
abline
, points
, hist
,
boxplot
, shapiro.test
, cor
,
quantile
, IQR
.
# Daten einlesen
url <-
"https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv"
artworks <- read.csv(url)
# Übersicht über die Daten
colnames(artworks)
table(as.factor(artworks$Classification))
# Daten selektieren
paintings <- artworks[artworks$Classification == "Painting",]
paintings <- paintings[,c("Title", "Artist",
"Date", "DateAcquired", "Height..cm.",
"Width..cm.")]
#------------------------------------------------
# Zusammenhang Entstehungsdatum und Ankaufdatum
paintings$Date <- gsub(".*?([1-2][0-9]{3}).*", "\\1",
paintings$Date)
paintings$Date <- as.numeric(paintings$Date)
paintings$DateAcquired <- gsub(".*?([1-2][0-9]{3}).*", "\\1",
paintings$DateAcquired)
paintings$DateAcquired <- as.numeric(paintings$DateAcquired)
paintingsDate <- paintings[!is.na(paintings$Date) &
!is.na(paintings$DateAcquired),]
# Streudiagramm
plot(paintingsDate$Date, paintingsDate$DateAcquired, main = "Zusammenhang
Entstehungsdatum und Ankaufdatum", xlab = "Entstehungsdatum", ylab =
"Ankaufdatum")
# Verteilungen der Variablen
hist(paintingsDate$Date)
boxplot(paintingsDate$Date)
shapiro.test(paintingsDate$Date)
hist(paintingsDate$DateAcquired)
boxplot(paintingsDate$DateAcquired)
shapiro.test(paintingsDate$DateAcquired)
# Korrelation mit Kendalls Tau (Moderater Zusammenhang), da metrische, nicht
normalverteilte Variablen
cor(paintingsDate$Date, paintingsDate$DateAcquired, method = "kendall")
#------------------------------------------------
# Zusammenhang Höhe und Breite
paintingsDim <- paintings[!is.na(paintings$Height..cm.) &
!is.na(paintings$Width..cm.),]
# Streudiagramm
plot(paintingsDim$Width..cm., paintingsDim$Height..cm., main = "Zusammenhang
Breite und Höhe", xlab = "Breite", ylab = "Höhe")
abline(lm(paintingsDim$Height..cm. ~ paintingsDim$Width..cm.), col =
"red")
# Ausreißer kennzeichnen
qWidth <- quantile(paintingsDim$Width..cm., 0.75)
iqrWidth <- IQR(paintingsDim$Width..cm.)*1.5
qHeight <- quantile(paintingsDim$Height..cm., 0.75)
iqrHeight <- IQR(paintingsDim$Height..cm.)*1.5
outliers <- paintingsDim[paintingsDim$Width..cm. > (qWidth + iqrWidth) |
paintingsDim$Height..cm. > (qHeight + iqrHeight),]
points(outliers$Width..cm., outliers$Height..cm., col = "green")
# Verteilungen der Variablen
hist(paintingsDim$Width..cm.)
boxplot(paintingsDim$Width..cm.)
shapiro.test(paintingsDim$Width..cm.)
hist(paintingsDim$Height..cm.)
boxplot(paintingsDim$Height..cm.)
shapiro.test(paintingsDim$Height..cm.)
# Korrelation mit Kendalls Tau (Starker Zusammenhang), da metrische, nicht
normalverteilte Variablen
cor(paintingsDim$Width..cm., paintingsDim$Height..cm., method = "kendall")
Coming Soon.