dataframe - r data.frame create new variable -
i have dataframe around 1.5 million rows , 5 cols. 1 variable (variable) of type nationality_year (e.g. spain_1998) , want split in 2 columns, 1 containing nationality, left side of name before underscore, , 1 containing year, right side of underscore. have tried concat.split should easiest way:
aa <- concat.split(mydata, "variable", sep = "_", drop = f) but after 2 hours running did not produce output. not sure if should leave running longer period of time or if there non time consuming way this.
any on issue appreciated!
here reproducible (subset!) sample:
mydata<- structure(list(province = c(1l, 4l, 7l, 8l, 11l, 14l, 17l, 20l, 24l, 28l, 30l, 33l, 36l, 41l, 44l, 46l, 48l, 3l, 6l, 8l, 10l, 13l, 15l, 18l, 23l, 26l, 29l, 31l, 35l, 38l, 41l, 46l, 47l, 2l, 4l, 8l, 8l, 11l, 15l, 17l, 21l, 24l, 28l, 30l, 33l, 37l, 41l, 45l, 46l, 49l, 3l, 6l, 8l, 10l, 13l, 15l, 19l, 23l, 27l, 29l, 32l, 36l, 39l, 43l, 46l, 48l, 2l, 5l, 8l, 8l, 12l, 15l, 18l, 21l, 24l, 28l, 30l, 33l, 37l, 41l, 45l, 46l, 50l, 3l, 7l, 8l, 10l, 14l, 16l, 20l, 23l, 27l, 29l, 32l, 36l, 39l, 43l, 46l, 48l, 3l, 6l, 8l, 8l, 12l, 15l, 18l, 21l, 25l, 28l, 31l, 34l, 38l, 41l, 45l, 46l, 50l, 3l, 7l, 8l, 11l, 14l, 17l, 20l, 23l, 27l, 29l, 33l, 36l, 40l, 43l, 46l, 48l, 3l, 6l, 8l, 9l, 12l, 15l, 18l, 22l, 25l, 28l, 31l, 35l, 38l, 41l, 45l, 46l, 50l, 4l, 7l, 8l, 11l, 14l, 17l, 20l, 24l, 28l, 30l, 33l, 36l, 41l, 43l, 46l, 48l, 3l, 6l, 8l, 10l, 13l, 15l, 18l, 22l, 26l, 28l, 31l, 35l, 38l, 41l, 46l, 47l, 1l, 4l, 8l, 8l, 11l, 14l, 17l, 20l, 24l, 28l, 30l, 33l, 36l, 41l, 44l, 46l, 49l, 3l, 6l), age5 = structure(c(1l, 5l, 9l, 7l, 6l, 7l, 5l, 8l, 3l, 3l, 3l, 5l, 8l, 2l, 3l, 6l, 9l, 5l, 7l, 4l, 3l, 5l, 8l, 8l, 2l, 8l, 2l, 9l, 7l, 9l, 9l, 2l, 7l, 2l, 9l, 1l, 8l, 8l, 1l, 8l, 1l, 6l, 4l, 6l, 7l, 2l, 3l, 1l, 7l, 5l, 6l, 9l, 5l, 6l, 8l, 9l, 3l, 4l, 3l, 4l, 4l, 1l, 3l, 1l, 2l, 2l, 6l, 6l, 2l, 9l, 2l, 2l, 1l, 5l, 9l, 5l, 8l, 9l, 7l, 4l, 3l, 7l, 2l, 8l, 2l, 6l, 9l, 1l, 5l, 1l, 6l, 6l, 6l, 7l, 3l, 6l, 3l, 3l, 4l, 1l, 1l, 2l, 9l, 6l, 4l, 3l, 8l, 3l, 7l, 1l, 5l, 2l, 6l, 6l, 8l, 5l, 9l, 5l, 6l, 2l, 3l, 1l, 4l, 8l, 9l, 8l, 1l, 5l, 1l, 6l, 4l, 6l, 2l, 3l, 3l, 5l, 9l, 5l, 5l, 4l, 7l, 8l, 4l, 2l, 5l, 7l, 8l, 9l, 8l, 3l, 7l, 7l, 5l, 6l, 3l, 6l, 1l, 2l, 2l, 3l, 7l, 1l, 9l, 5l, 8l, 4l, 5l, 4l, 1l, 3l, 7l, 7l, 9l, 3l, 9l, 7l, 5l, 7l, 8l, 1l, 4l, 4l, 6l, 1l, 8l, 7l, 8l, 6l, 8l, 4l, 3l, 4l, 5l, 9l, 2l, 6l, 6l, 1l, 5l, 7l), .label = c("10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"), zona91ok = c(101l, 4079l, 712l, 8205l, 11022l, 14021l, 1714l, 20067l, 2414l, 2810l, 300799l, 3305l, 36026l, 41024l, 4405l, 4607l, 48015l, 308l, 610l, 8121l, 1006l, 1307l, 1511l, 1813l, 2308l, 2605l, 2910l, 310799l, 35026l, 3811l, 411199l, 4601l, 4708l, 202l, 405l, 8015l, 837l, 11033l, 1502l, 1702l, 2112l, 2408l, 28047l, 30015l, 3305l, 3709l, 410199l, 4511l, 1202l, 490699l, 3063l, 610l, 827l, 1006l, 1301l, 15036l, 1901l, 2310l, 2709l, 29025l, 3201l, 36008l, 390899l, 4301l, 46184l, 4805l, 206l, 504l, 817l, 813l, 12135l, 1519l, 1810l, 2104l, 2402l, 28130l, 30030l, 3305l, 3707l, 411399l, 45165l, 46181l, 5008l, 305l, 7026l, 803l, 1006l, 1413l, 16078l, 200999l, 2312l, 2712l, 29069l, 3210l, 3616l, 391199l, 4313l, 46105l, 4805l, 310l, 6153l, 8252l, 8205l, 1205l, 1505l, 1808l, 2110l, 2508l, 2810l, 311399l, 3405l, 3807l, 41024l, 4507l, 46102l, 500599l, 3014l, 706l, 8121l, 11028l, 14042l, 1712l, 20045l, 2314l, 27031l, 29901l, 33024l, 3614l, 400199l, 4307l, 46021l, 4805l, 3066l, 6153l, 8015l, 901l, 12040l, 1522l, 1806l, 2203l, 2508l, 28047l, 311099l, 35004l, 3801l, 410199l, 4515l, 46017l, 501199l, 407l, 7027l, 827l, 1102l, 1404l, 17155l, 200599l, 24089l, 2812l, 30019l, 33024l, 3612l, 41038l, 4301l, 4628l, 4805l, 307l, 6153l, 817l, 1004l, 1309l, 1508l, 1804l, 2206l, 2606l, 28130l, 310799l, 35011l, 38022l, 411399l, 4622l, 4701l, 1036l, 4079l, 807l, 803l, 1108l, 1410l, 1708l, 201399l, 2410l, 28058l, 30043l, 33024l, 3610l, 410399l, 4401l, 4621l, 490499l, 3059l, 6153l), variable = structure(c(1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 2l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 3l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 4l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 5l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 6l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 7l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 8l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 9l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 10l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 11l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 12l, 13l, 13l), .label = c("spain_1998", "eu15dc_1998", "roe_1998", "magreb_1998", "ssa_1998", "la_1998", "asia_1998", "row_1998", "total_1998", "spain_1999", "eu15dc_1999", "roe_1999", "magreb_1999", "ssa_1999", "la_1999", "asia_1999", "row_1999", "total_1999", "spain_2000", "eu15dc_2000", "roe_2000", "magreb_2000", "ssa_2000", "la_2000", "asia_2000", "row_2000", "total_2000", "spain_2001", "eu15dc_2001", "roe_2001", "magreb_2001", "ssa_2001", "la_2001", "asia_2001", "row_2001", "total_2001", "spain_2002", "eu15dc_2002", "roe_2002", "magreb_2002", "ssa_2002", "la_2002", "asia_2002", "row_2002", "total_2002", "spain_2003", "eu15dc_2003", "roe_2003", "magreb_2003", "ssa_2003", "la_2003", "asia_2003", "row_2003", "total_2003", "spain_2004", "eu15dc_2004", "roe_2004", "magreb_2004", "ssa_2004", "la_2004", "asia_2004", "row_2004", "total_2004", "spain_2005", "eu15dc_2005", "roe_2005", "magreb_2005", "ssa_2005", "la_2005", "asia_2005", "row_2005", "total_2005", "spain_2006", "eu15dc_2006", "roe_2006", "magreb_2006", "ssa_2006", "la_2006", "asia_2006", "row_2006", "total_2006", "spain_2007", "eu15dc_2007", "roe_2007", "magreb_2007", "ssa_2007", "la_2007", "asia_2007", "row_2007", "total_2007", "spain_2008", "eu15dc_2008", "roe_2008", "magreb_2008", "ssa_2008", "la_2008", "asia_2008", "row_2008", "total_2008", "spain_2009", "eu15dc_2009", "roe_2009", "magreb_2009", "ssa_2009", "la_2009", "asia_2009", "row_2009", "total_2009", "spain_2010", "eu15dc_2010", "roe_2010", "magreb_2010", "ssa_2010", "la_2010", "asia_2010", "row_2010", "total_2010", "spain_2011", "eu15dc_2011", "roe_2011", "magreb_2011", "ssa_2011", "la_2011", "asia_2011", "row_2011", "total_2011", "spain_2012", "eu15dc_2012", "roe_2012", "magreb_2012", "ssa_2012", "la_2012", "asia_2012", "row_2012", "total_2012", "notspain_1998", "notspain_1999", "notspain_2000", "notspain_2001", "notspain_2002", "notspain_2003", "notspain_2004", "notspain_2005", "notspain_2006", "notspain_2007", "notspain_2008", "notspain_2009", "notspain_2010", "notspain_2011", "notspain_2012", "africa_1998", "africa_1999", "africa_2000", "africa_2001", "africa_2002", "africa_2003", "africa_2004", "africa_2005", "africa_2006", "africa_2007", "africa_2008", "africa_2009", "africa_2010", "africa_2011", "africa_2012", "dwc_1998", "dwc_1999", "dwc_2000", "dwc_2001", "dwc_2002", "dwc_2003", "dwc_2004", "dwc_2005", "dwc_2006", "dwc_2007", "dwc_2008", "dwc_2009", "dwc_2010", "dwc_2011", "dwc_2012"), class = "factor"), frequency = c(614, 1943, 59, 201, 188, 10859, 93, 1494, 60, 1001, 1000, 689, 675, 934, 51, 1240, 165, 13, 0, 14, 2, 2, 2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0, 0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5, 64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839, 4303, 100, 1727, 663, 694, 1210, 623, 1261, 772, 697, 490, 1031, 490, 956, 704, 1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528, 3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3, 12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0, 0, 0, 5, 2)), .names = c("province", "age5", "zona91ok", "variable", "frequency"), row.names = c(1l, 501l, 1001l, 1501l, 2001l, 2501l, 3001l, 3501l, 4001l, 4501l, 5001l, 5501l, 6001l, 6501l, 7001l, 7501l, 8001l, 8501l, 9001l, 9501l, 10001l, 10501l, 11001l, 11501l, 12001l, 12501l, 13001l, 13501l, 14001l, 14501l, 15001l, 15501l, 16001l, 16501l, 17001l, 17501l, 18001l, 18501l, 19001l, 19501l, 20001l, 20501l, 21001l, 21501l, 22001l, 22501l, 23001l, 23501l, 24001l, 24501l, 25001l, 25501l, 26001l, 26501l, 27001l, 27501l, 28001l, 28501l, 29001l, 29501l, 30001l, 30501l, 31001l, 31501l, 32001l, 32501l, 33001l, 33501l, 34001l, 34501l, 35001l, 35501l, 36001l, 36501l, 37001l, 37501l, 38001l, 38501l, 39001l, 39501l, 40001l, 40501l, 41001l, 41501l, 42001l, 42501l, 43001l, 43501l, 44001l, 44501l, 45001l, 45501l, 46001l, 46501l, 47001l, 47501l, 48001l, 48501l, 49001l, 49501l, 50001l, 50501l, 51001l, 51501l, 52001l, 52501l, 53001l, 53501l, 54001l, 54501l, 55001l, 55501l, 56001l, 56501l, 57001l, 57501l, 58001l, 58501l, 59001l, 59501l, 60001l, 60501l, 61001l, 61501l, 62001l, 62501l, 63001l, 63501l, 64001l, 64501l, 65001l, 65501l, 66001l, 66501l, 67001l, 67501l, 68001l, 68501l, 69001l, 69501l, 70001l, 70501l, 71001l, 71501l, 72001l, 72501l, 73001l, 73501l, 74001l, 74501l, 75001l, 75501l, 76001l, 76501l, 77001l, 77501l, 78001l, 78501l, 79001l, 79501l, 80001l, 80501l, 81001l, 81501l, 82001l, 82501l, 83001l, 83501l, 84001l, 84501l, 85001l, 85501l, 86001l, 86501l, 87001l, 87501l, 88001l, 88501l, 89001l, 89501l, 90001l, 90501l, 91001l, 91501l, 92001l, 92501l, 93001l, 93501l, 94001l, 94501l, 95001l, 95501l, 96001l, 96501l, 97001l, 97501l, 98001l, 98501l, 99001l, 99501l), class = "data.frame")
try instead:
library(data.table) dt = data.table(mydata) dt[, `:=`(nationality = sub('(.*)_(.*)', '\\1', variable), year = sub('(.*)_(.*)', '\\2', variable))]
Comments
Post a Comment