I have a data frame that I need to pivot but the data frame has duplicate identifiers, so spread
function gives an error Error: Duplicate identifiers for rows (5, 6)
Dimension = c("A","A","B","B","A","A")
Date = c("Mon","Tue","Mon","Wed","Fri","Fri")
Metric = c(23,25,7,9,7,8)
df = data.frame(Dimension,Date,Metric)
df
Dimension Date Metric
1 A Mon 23
2 A Tue 25
3 B Mon 7
4 B Wed 9
5 A Fri 7
6 A Fri 8
library(tidyr)
df1 = spread(df, Date, Metric, fill = " ")
Error: Duplicate identifiers for rows (5, 6)
I then consolidated the rows and pasted the Metric
:
dfa = aggregate(df[3], df[-3],
FUN = function(X) paste(unique(X), collapse=", "))
Dimension Date Metric
1 A Fri 7, 8
2 A Mon 23
3 B Mon 7
4 A Tue 25
5 B Wed 9
Then repeat and of course it works now:
df1 = spread(dfa, Date, Metric, fill = " ")
df1
Dimension Fri Mon Tue Wed
1 A 7, 8 23 25
2 B 7 9
Question: is there an "easier" way of doing this, or is my method above efficient enough so I don't need to lose sleep over it? Thanks!
EDIT.
All codes - mine and 2 akrun's work fine with this small dataset. However, akrun's dplyr
version breaks on my real dataset. Here's dput
.
structure(list(Dimension = c(10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 10994030020, 10994030020, 10994030020, 10994030020,
10994030020, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12300245685, 12300245685, 12300245685, 12300245685, 12300245685,
12303485675, 12303485675, 12303485675, 12303485675, 12303485675,
12701670437, 12701670437, 12701670437, 12701670437, 12701670437,
12701670437, 12701670437, 12701670437, 12701670437, 12701670437,
12701670437, 12701670437, 12701670437, 12701670437, 12701670437,
12701670437), Date = structure(c(1L, 3L, 5L, 7L, 9L, 10L, 11L,
12L, 13L, 14L, 16L, 18L, 19L, 20L, 22L, 23L, 24L, 26L, 27L, 28L,
30L, 32L, 33L, 34L, 40L, 41L, 42L, 47L, 48L, 49L, 51L, 52L, 53L,
54L, 55L, 58L, 59L, 60L, 61L, 62L, 63L, 65L, 66L, 68L, 69L, 70L,
74L, 75L, 76L, 2L, 3L, 5L, 7L, 8L, 10L, 11L, 15L, 17L, 20L, 21L,
24L, 25L, 28L, 30L, 31L, 34L, 36L, 42L, 43L, 46L, 48L, 49L, 53L,
54L, 56L, 65L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 7L, 24L, 30L,
38L, 65L, 4L, 6L, 7L, 24L, 28L, 29L, 30L, 35L, 37L, 39L, 44L,
45L, 50L, 57L, 64L, 65L), .Label = c("16", "analog tuner", "aspect ratio",
"assembled in country of origin", "backlight technology", "battery type",
"brand", "brightness", "color class", "component video", "composite video",
"country of origin", "depth w/ stand", "digital audio output",
"digital tuner", "display technology", "features", "green compliance certificate/authority",
"green compliant", "hdmi", "headphone jack", "height w/ stand",
"limited warranty", "manufacturer", "maximum resolution", "media player",
"motion interpolation technology", "mpn", "multi pack indicator",
"name", "native contrast ratio", "number of hdmi ports", "number of usb ports",
"operating power consumption", "origin of components", "package contents",
"primary color", "product dimensions", "product in in (l x w x h)",
"product model", "product series", "product type", "remote control incl",
"remote included", "resolution", "response time", "rms output power",
"scan format", "screen size", "shipping weight (in lb)", "sound system",
"speaker output power (w)", "speakers", "standard refresh rate",
"standby power consumption", "total number of hdmi ports", "tv definition",
"tv features", "tv refresh rate (hz)", "tv resolution", "tv screen size (in)",
"tv screen size range", "tv speakers", "tv technology", "unspsc",
"usb", "vertical viewing angle", "vesa mount standard", "vga",
"video signal standard", "viewing angle", "warranty length",
"wattage", "weight (approx)", "weight w/ stand (approx)", "width w/ stand"
), class = "factor"), Metric = structure(c(40L, 13L, 57L, 69L,
43L, 72L, 72L, 45L, 38L, 72L, 55L, 44L, 72L, 72L, 15L, 3L, 69L,
72L, 46L, 26L, 70L, 27L, 1L, 29L, 26L, 54L, 58L, 12L, 39L, 25L,
42L, 11L, 72L, 37L, 28L, 52L, 36L, 39L, 24L, 19L, 72L, 33L, 72L,
18L, 72L, 49L, 6L, 10L, 23L, 62L, 13L, 48L, 64L, 31L, 72L, 72L,
41L, 66L, 72L, 72L, 64L, 16L, 63L, 65L, 4L, 32L, 21L, 58L, 71L,
35L, 8L, 20L, 72L, 37L, 17L, 33L, 14L, 7L, 72L, 50L, 14L, 2L,
34L, 59L, 59L, 60L, 5L, 33L, 51L, 47L, 67L, 67L, 53L, 61L, 68L,
51L, 43L, 30L, 72L, 9L, 22L, 49L, 56L, 33L), .Label = c("1",
"1-year limited", "1 Year", "1,000:1", "1,140 x 145 x 705 in ; 65.6 lb",
"10.40 lb", "100 x 100", "1080p", "1080p (HDTV)", "11.20 lb",
"14", "14 W", "16:9", "178 degrees", "18.30 in", "1920 x 1080",
"2", "200 x 100", "21", "22 in", "22 in FHD LED TV; Remote Control",
"25.4", "26.20 in", "29", "29 in", "29L1350U", "3", "300 mW",
"33.80 W", "36.5 x 6.5 x 23.0", "365 Nit", "50 W", "52161505",
"6 W", "6.50 ms", "60", "60 Hz", "7.10 in", "720p", "9", "ATSC",
"Audyssey EQ", "Black", "CEC", "China", "ClearScan 120 Hz", "Does Not Contain a Battery",
"Edge LED", "HDTV", "HDTV 1080p", "Imported", "Internet Apps",
"KDL40W600B", "L1350U", "LCD", "LCD, Internet Connected, LED",
"LED", "LED-LCD TV", "LG", "LG 47LY340C - 47 in - commercial use LED-backlit L",
"No", "NTSC", "PLED2243A", "ProScan", "PROSCAN PLED2243A 22 in 1080p 60 Hz LED HDTV - PTR",
"Sleep Timer; Auto Program", "Sony", "Sony KDL40W600B 40 in 1080p 60 Hz Smart LED TV (20",
"Toshiba", "Toshiba 29L1350U 29 in 720p LED-LCD TV - 16:9 - HD",
"yes", "Yes"), class = "factor")), .Names = c("Dimension", "Date",
"Metric"), class = c("data.table", "data.frame"), row.names = c(NA,
-104L), .internal.selfref = <pointer: 0x00000000003d0788>)
You could use dcast
from the devel version of data.table
ie. v1.9.5
. Instructions to install are here
library(data.table)#v1.9.5+
dcast(setDT(df), Dimension~Date, value.var='Metric',
fun.aggregate=function(x) toString(unique(x)))
# Dimension Fri Mon Tue Wed
#1: A 7, 8 23 25
#2: B 7 9
Or
library(dplyr)
library(tidyr)
df %>%
group_by(Dimension, Date) %>%
summarise(Metric=toString(unique(Metric))) %>%
spread(Date, Metric, fill='')
# Dimension Fri Mon Tue Wed
#1 A 7, 8 23 25
#2 B 7 9
Using the new dataset from `OP's post
setDF(df2)
df2 %>%
group_by(Dimension, Date) %>%
summarise(Metric=toString(unique(Metric))) %>%
spread(Date, Metric, fill='') %>%
head(2) %>%
select(1:3)
# Dimension 16 analog tuner
#1 10994030020 9
#2 12300245685 NTSC