How to split CSV file and create multiple CSV files based on a column
Try:
awk -F, -v yesterday="$(date -d'-1day' +'%F')" '
BEGIN{ for(min=0; min<1440; min++){
mins = "date +%F\" " "\"%T -d\"" min "minutes" yesterday"\""
mins |getline yday_tmp; close(mins);
timestamp["\"" yday_tmp "\""] }
}
NR==1{ hdr=$0; next }
($1 in timestamp){
cp=$1; gsub(/[-": ]/, "", cp);
print hdr ORS $0 >(cp".csv");
close(cp".csv");
delete timestamp[$1] }
END{ for (x in timestamp){
cpx=x; gsub(/[-": ]/, "", cpx);
print hdr ORS x ",0,0,0,0" >(cpx".csv")
close(cpx".csv")
}
}' infile
Using GNU awk
for the strftime() and mktime() functions to reduce the execution time for generating the timestamps instead of calling external date
command and also store the files in separate day directory and remove all double quotes:
gawk -F, '
BEGIN{ start=strftime("%Y %m %d 00 00 00", systime()-86400);
for(min=0; min<1440; min++)
timestamp[strftime("%F %H:%M", mktime(start)+min*60)]
}
{ gsub(/"/,"") }
NR==1{
hdr=$0; yday=strftime("dir_%Y%m%d", systime()-86400);
system("mkdir "yday); next
}
(substr($1,1,16) in timestamp){
cp=$1; gsub(/[-: ]|00$/, "", cp);
print hdr ORS $0 >(yday"/"cp".csv");
close(yday"/"cp".csv");
delete timestamp[substr($1,1,16)] }
END{ for (x in timestamp){
cpx=x; gsub(/[-: ]/, "", cpx);
print hdr ORS x ",0,0,0,0" >(yday"/"cpx".csv");
close(yday"/"cpx".csv")
}
}' infile
As in GNU awk
documentation:
systime()
Return the current time of day as the number of seconds since the Epoch (1970-01-01 00:00:00 UTC on POSIX systems).
Let's print it:
$ awk 'BEGIN{ print systime() }'
1614100199
mktime(timestamp)
Turn the timestamp in the format of the YYYY MM DD HH MM SS
into the epoch time.
Let's print it;
$ awk 'BEGIN{ print mktime("2021 02 22 00 00 00") }'
1613939400
strftime(format, timestamp)
: Format timestamp according to the specification in format. The timestamp should be
in the epoch type.
Let's format a timestamp:
$ awk 'BEGIN{ print strftime("%Y-%m-%d %H:%M:%S", mktime("2021 02 23 01 02 00")) }'
2021-02-23 01:02:00
Remember all 3 above awk
time functions.
Now let's see what they do one by as used in the answer:
$ awk 'BEGIN{ print systime()-86400 }'
1614014848
Notice the 86400
is the number of seconds in each day or 24hours; at above we said systime()
return current time of day as the number of seconds since the Epoch, so if we minus seconds of a day from current time it gives us the time with yesterday date.
Let's convert it to human readable to see what is that:
$ awk 'BEGIN{ print strftime("%Y %m %d 00 00 00", systime()-86400); }'
2021 02 22 00 00 00
now it's clear what timestamp it is, we used Hour/Min/Sec to "00" because we need this timestamp as start point and we store it into start
variable in the code.
then we used a for-loop to generate the rest of the timestamps from the timestamp in the start
variable as following:
for(min=0; min<1440; min++)
timestamp[strftime("%F %H:%M", mktime(start)+min*60)]
Notice the number 1440
? that is the number of minutes in a day or 24hours (24*60=1440); but mktime()
accept timestamp as epoch and in seconds, so we multiply each minute to 60 to get timestamps in seconds then turn it into this format %F %H:%M
(F
ull format of the date same as %Y-%m-%d
, H
our and M
inute) and save into an awk array we name it timestamp[...]
; now we have yesterday's date of all timestamps minutely.
you can even print them to see what they are:
$ awk '
BEGIN{
start=strftime("%Y %m %d 00 00 00", systime()-86400);
for(min=0; min<1440; min++)
timestamp[strftime("%F %H:%M", mktime(start)+min*60)];
for (t in timestamp)
print t
}'
Below gsub() function removes all quotes from the current line:
{ gsub(/"/,"") }
then we backup the input file's first line which is the header line into hdr
variable as we need the header line to be added into every file we generate; then also we create a directory with yesterday date as well and it will take the format of dir_%Y%m%d
; below code block runs once only when it's first input line NR==1 { "run these" }
:
NR==1{
hdr=$0; yday=strftime("dir_%Y%m%d", systime()-86400);
system("mkdir "yday); next
}
With the system() function we are calling the external command mkdir
to create that directory.
Going into next block, run the following block only if timestamp from the first column was seen in the timestamp
array (substr($1,1,16) in timestamp) { "run these" }
; substr(string, start [, length ]) function return a length-character-long substring of string, starting at character number start.
cp=$1
: we copy first column intocp
variable, we will use value in thecp
for later processing.gsub(/[-: ]|00$/, "", cp);
; strip characters-
,:
and Space from thecp
variable as well as the trailing double zero "00"s.print hdr ORS $0 >(yday"/"cp".csv");
:print
the header line which we keep it in thehdr
var, anORS
(that's a newline character for Output Record Separator by default) and entire line$0
into the relateddirectory/fileName.csv
.close(yday"/"cp".csv");
: close() the file after write.delete timestamp[substr($1,1,16)]
: and delete that timestamp from the array.
and in the END { "run these" }
block we print to the files for those timestamps that didn't exist in the input file.
To process multiple files and split each input file into individual day directory.
gawk -F, '
{ gsub(/"/,"") }
FNR==1{
delete timestamp;
start=strftime("%Y %m %d 00 00 00", systime()-86400);
for(min=0; min<1440; min++)
timestamp[strftime("%F %H:%M", mktime(start)+min*60)]
hdr=$0; yday=strftime("%Y%m%d", systime()-86400);
fname=FILENAME; sub(/\.csv$/,"", fname); dirName=fname"_"yday;
system("mkdir "dirName); next
}
(substr($1,1,16) in timestamp){
cp=$1; gsub(/[-: ]|00$/, "", cp);
print hdr ORS $0 >(dirName"/"cp".csv");
close(dirName"/"cp".csv");
delete timestamp[substr($1,1,16)] }
ENDFILE{ for (x in timestamp){
cpx=x; gsub(/[-: ]/, "", cpx);
print hdr ORS x ",0,0,0,0" >(dirName"/"cpx".csv");
close(dirName"/"cpx".csv")
}
}' multiple*.csv