| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267 |
#!/usr/bin/env bash
#
# Hive benchmark driver with per-host dstat resource collection.
#
# Usage: <script> <databaseName> <queryListFile> <runName>
#   databaseName   Database name handed to `hive --database`.
#   queryListFile  Text file read line by line; each line is the path of a
#                  file containing one query.
#   runName        Label used as the top-level output directory and as the
#                  first column of the result CSVs.
#
# The list of hosts to monitor is read from dn.txt in the current directory.
# The shebang goes through env because bash is not installed at
# /usr/bin/bash on every system.

databaseName="$1"
filename="$2"
runName="$3"

# Generic fallback value for optional function parameters.
DEFAULT=default

# Interval, in seconds, passed to dstat as its delay argument. The resource
# summary reports the number of samples as elapsed seconds, which only holds
# while this stays at 1.
samplingrate=1
#######################################
# Start a background dstat collector on every host listed in dn.txt.
#
# For each host the output directory is created remotely and
# `dstat -t -a --output <dir>/<host>.<collection>.csv <interval>` is launched
# in the background over `sudo ssh`.
#
# Globals:
#   samplingrate (read) - dstat delay in seconds; 1 is used when unset.
# Arguments:
#   $1 - directory (on the remote hosts) that receives the CSV files.
#   $2 - collection name embedded in each CSV file name.
# Inputs:
#   dn.txt in the current directory, one host name per line.
# Returns:
#   0 after every host has been processed, 1 when an argument is empty.
#######################################
startDstatsCollection () {
  if [ -z "$1" ]; then
    echo "-Parameter #1 run path is zero length.-" >&2
    return 1
  fi
  if [ -z "$2" ]; then
    echo "-Parameter #2 collection name is zero length.-" >&2
    return 1
  fi

  local dstatoutdir=$1
  local collectionname=$2
  local machinename outfile

  while read -r machinename; do
    # A blank line in dn.txt would otherwise become `ssh ""`.
    [ -n "$machinename" ] || continue

    echo "Start dstat on $machinename"
    outfile="$dstatoutdir/$machinename.$collectionname.csv"

    # ssh gets /dev/null on stdin so it cannot swallow the rest of dn.txt.
    sudo ssh "$machinename" "mkdir -p \"$dstatoutdir\" ;" < /dev/null

    # Both stdout and stderr of the remote dstat are detached; a background
    # process that keeps a descriptor of the session open can keep ssh from
    # returning.
    sudo ssh "$machinename" \
      "dstat -t -a --output \"$outfile\" ${samplingrate:-1} > /dev/null 2>&1 & " \
      < /dev/null
  done < dn.txt
}
#######################################
# Stop the dstat collectors on every host listed in dn.txt.
#
# The kill commands are issued to all hosts in parallel and the function
# only returns once every ssh session has finished, so a collector started
# right afterwards cannot be hit by a kill that is still in flight.
#
# Arguments:
#   None.
# Inputs:
#   dn.txt in the current directory, one host name per line.
# Returns:
#   0 always; a host without a running dstat is not an error.
#######################################
stopDstatsCollection () {
  local machinename
  local -a pids=()

  while read -r machinename; do
    [ -n "$machinename" ] || continue

    echo "$machinename"
    echo "Stop dstat on $machinename"

    # The [d]stat pattern matches a command line containing "dstat" but not
    # the remote shell running this very command (its command line contains
    # the literal text "[d]stat"), so the session does not signal itself.
    sudo ssh "$machinename" "pkill -f '[d]stat' > /dev/null 2>&1" < /dev/null &
    pids+=("$!")
  done < dn.txt

  # Barrier: wait for every kill to complete before returning.
  if [ "${#pids[@]}" -gt 0 ]; then
    wait "${pids[@]}"
  fi
  return 0
}
#######################################
# Copy a run directory from every other host into the local base output
# directory.
#
# The first host in dn.txt is skipped; every remaining host's copy of the
# directory is fetched with `sudo scp -rp`.
#
# Globals:
#   baseoutdir (read) - local destination directory for the copies.
# Arguments:
#   $1 - directory to copy from each remote host.
# Inputs:
#   dn.txt in the current directory, one host name per line.
# Returns:
#   0 after every host has been processed, 1 when $1 or baseoutdir is empty.
#######################################
consLogFile () {
  if [ -z "$1" ]; then
    echo "-Parameter #1 run path is zero length.-" >&2
    return 1
  fi
  if [ -z "$baseoutdir" ]; then
    echo "-baseoutdir is not set; nowhere to copy to.-" >&2
    return 1
  fi

  local sourcedir=$1
  local headmachine="" machinename

  # First line of dn.txt; that host is excluded from the copy loop below.
  read -r headmachine < dn.txt

  while read -r machinename; do
    [ -n "$machinename" ] || continue
    if [ "$headmachine" = "$machinename" ]; then
      continue
    fi

    echo "Copy from $machinename $sourcedir"
    # stdin is detached so scp cannot consume the remaining host names.
    sudo scp -rp "$machinename:$sourcedir" "$baseoutdir" < /dev/null
  done < dn.txt
}
#######################################
# Summarize every dstat CSV in a directory into one summary CSV.
#
# For each file whose name ends in "csv", the rows following the
# "date/time" column-header line are accumulated: columns 2 and 3 as CPU
# user/sys, columns 8 and 9 as disk read/write, columns 10 and 11 as network
# read/write. One row with the averages and totals is appended per file.
#
# File names are split on "."; field 1 is reported as the machine name,
# field 2 as the query, and the field just before the extension as the
# execution number (the collector names files <host>.<query>.<n>.csv).
#
# The sample count is written under the "elapsedTimeSecs" heading, i.e. it
# equals elapsed seconds only for a one-second sampling interval.
#
# Arguments:
#   $1 - directory containing the dstat CSV files.
#   $2 - path of the summary CSV to create (overwritten).
# Outputs:
#   Progress messages on stdout, the summary in $2. Requires bc.
# Returns:
#   0 on success, 1 when an argument is empty or $1 is not a directory.
#######################################
summarizedstats () {
  if [ -z "$1" ]; then
    echo "-Parameter #1 run path is zero length.-" >&2
    return 1
  fi
  if [ -z "$2" ]; then
    echo "-Parameter #2 summary file is zero length.-" >&2
    return 1
  fi

  local dstatfilesdir=$1
  local dstatsummaryfile=$2
  local csvpath csvname parsecsvdata sampleCount fieldcount
  local totCpuUser totCpuSys totDiskRead totDiskWrite totNetworkRead totNetworkWrite
  local avgCpuUser avgCpuSys avgDiskRead avgDiskWrite avgNetworkRead avgNetworkWrite
  local machineName query executionNumber
  local c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12
  local -a testinfo

  echo "$dstatfilesdir"
  echo "$dstatsummaryfile"
  echo "machineName,query,executionNumber,elapsedTimeSecs,avgCpuUser,avgCpuSys,avgDiskReadBytes/sec,avgDiskWriteBytes/sec,avgNetworkReadBytes/sec,avgNetworkWriteBytes/sec,totalCpuUser,totalCpuSys,totalDiskReadBytes,totalDiskWriteBytes,totalNetworkReadBytes,totalNetworkWriteBytes,filename" > "$dstatsummaryfile"

  if [ ! -d "$dstatfilesdir" ]; then
    echo "-dstat directory $dstatfilesdir does not exist.-" >&2
    return 1
  fi

  # The files are addressed by path instead of cd-ing into the directory, so
  # the caller's working directory is untouched and a relative summary path
  # keeps pointing at the same file for the header and for the rows.
  for csvpath in "$dstatfilesdir"/*csv; do
    # An unmatched glob stays literal; skip it and anything not a regular file.
    [ -f "$csvpath" ] || continue
    # Never summarize the summary itself if it lives in the same directory.
    if [ "$csvpath" -ef "$dstatsummaryfile" ]; then
      continue
    fi

    csvname=${csvpath##*/}
    echo "Summarizing on $csvname..."

    parsecsvdata=0
    sampleCount=0
    totCpuUser=0
    totCpuSys=0
    totDiskRead=0
    totDiskWrite=0
    totNetworkRead=0
    totNetworkWrite=0

    # Split each line on ","; c12 absorbs all remaining columns. The
    # `|| [ -n "$c1" ]` keeps a final line that lacks a trailing newline.
    while IFS=',' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 || [ -n "$c1" ]; do
      # Everything up to and including the column-header line is preamble.
      if [ "$c1" = "\"date/time\"" ]; then
        parsecsvdata=1
        continue
      fi
      [ "$parsecsvdata" -eq 1 ] || continue

      # A truncated row (e.g. collector killed mid-write) would hand bc an
      # incomplete expression; skip rows missing any needed column.
      if [ -z "$c2" ] || [ -z "$c3" ] || [ -z "$c8" ] || [ -z "$c9" ] \
        || [ -z "$c10" ] || [ -z "$c11" ]; then
        continue
      fi

      sampleCount=$((sampleCount + 1))
      totCpuUser=$(bc <<< "scale=4; $totCpuUser+$c2")
      totCpuSys=$(bc <<< "scale=4; $totCpuSys+$c3")
      totDiskRead=$(bc <<< "scale=4; $totDiskRead+$c8")
      totDiskWrite=$(bc <<< "scale=4; $totDiskWrite+$c9")
      totNetworkRead=$(bc <<< "scale=4; $totNetworkRead+$c10")
      totNetworkWrite=$(bc <<< "scale=4; $totNetworkWrite+$c11")
    done < "$csvpath"

    # Averages; a file without data rows reports 0 instead of dividing by 0.
    if [ "$sampleCount" -gt 0 ]; then
      avgCpuUser=$(bc <<< "scale=4; $totCpuUser/$sampleCount")
      avgCpuSys=$(bc <<< "scale=4; $totCpuSys/$sampleCount")
      avgDiskRead=$(bc <<< "scale=4; $totDiskRead/$sampleCount")
      avgDiskWrite=$(bc <<< "scale=4; $totDiskWrite/$sampleCount")
      avgNetworkRead=$(bc <<< "scale=4; $totNetworkRead/$sampleCount")
      avgNetworkWrite=$(bc <<< "scale=4; $totNetworkWrite/$sampleCount")
    else
      avgCpuUser=0
      avgCpuSys=0
      avgDiskRead=0
      avgDiskWrite=0
      avgNetworkRead=0
      avgNetworkWrite=0
    fi

    # Parse the test metadata from the file name.
    IFS=. read -r -a testinfo <<< "$csvname"
    fieldcount=${#testinfo[@]}
    machineName="${testinfo[0]}"
    query="${testinfo[1]}"
    # The execution number is the field right before the extension, whether
    # or not the query name itself contains a dot.
    executionNumber=""
    if [ "$fieldcount" -ge 2 ]; then
      executionNumber="${testinfo[fieldcount - 2]}"
    fi

    echo "$machineName,$query,$executionNumber,$sampleCount,$avgCpuUser,$avgCpuSys,$avgDiskRead,$avgDiskWrite,$avgNetworkRead,$avgNetworkWrite,$totCpuUser,$totCpuSys,$totDiskRead,$totDiskWrite,$totNetworkRead,$totNetworkWrite,$csvname" >> "$dstatsummaryfile"
  done
}
# ---------------------------------------------------------------------------
# Main driver.
#
# For every query file named in the list ($filename):
#   1. run it once prefixed with "explain" and store the plan,
#   2. run it $loopCount times with dstat collection active on all hosts,
#   3. record the timings in summary.csv.
# Afterwards the run directory is pulled from the other hosts and the dstat
# CSVs are condensed into resourceSummary.csv.
# ---------------------------------------------------------------------------

if [ -z "$databaseName" ] || [ -z "$filename" ] || [ -z "$runName" ]; then
  echo "Usage: $0 <databaseName> <queryListFile> <runName>" >&2
  exit 2
fi
if [ ! -r "$filename" ]; then
  echo "Query list file '$filename' is not readable." >&2
  exit 1
fi

# Number of timed executions per query.
loopCount="4"
# NOTE(review): %s is seconds since the epoch; %m (month) may have been
# intended. Left unchanged because it determines the output directory name.
timeStamp="$(date "+%s-%d-%H-%M-%S")"
baseoutdir="$runName/$databaseName"
outDir="$baseoutdir/$timeStamp"
explainPlans="$outDir/explainPlans"
queryOutPut="$outDir/queryOutPut"
profileOutPut="$outDir/queryProfile"
tempQueries="$outDir/tempQueries"
masterLogFile="$outDir/masterlog.txt"
executionTimeFile="$outDir/summary.csv"
currentDir=$(pwd)
# Absolute paths: these are also used on the remote hosts and by the
# summarizer.
dstatCountersDir="$currentDir/$outDir/dstats"
resourcesummaryfile="$currentDir/$outDir/resourceSummary.csv"

if ! mkdir -p "$outDir" "$tempQueries" "$queryOutPut" "$explainPlans" \
  "$profileOutPut" "$dstatCountersDir"; then
  echo "Cannot create output directories under '$outDir'." >&2
  exit 1
fi

echo "runName,databaseName,queryName,executionNumber,responseTime,rows(s),jobID,executionTime" > "$executionTimeFile"

explainString="explain "
exitString="exit;"

while read -r line; do
  # Ignore blank lines in the query list.
  [ -n "$line" ] || continue

  # Output files are named after the query file's base name so that a list
  # entry containing directories does not point into non-existent subfolders.
  queryName=${line##*/}

  # --- explain plan ---------------------------------------------------------
  queryFileName="$tempQueries/$queryName.explain.txt"
  outFileName="$explainPlans/$queryName.explain.out"
  echo "$explainString " | cat - "$line" > "$queryFileName"
  echo "$exitString" >> "$queryFileName"

  echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName"
  echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName" >> "$masterLogFile"
  START=$(($(date +%s%N) / 1000000))

  # stdin is detached so hive can never read from the query list that feeds
  # this loop.
  hive --database "$databaseName" -i "$queryFileName" > "$outFileName" 2>&1 < /dev/null
  #impala-shell -i impala2 -V -B -d $databaseName -f "$queryFileName" > "$outFileName" 2>&1

  END=$(($(date +%s%N) / 1000000))
  ET=$((END - START))

  # Empty responseTime/rows/jobID fields keep the row aligned with the
  # 8-column header; the wall-clock time lands under executionTime.
  echo "$runName,$databaseName,$queryName.explain,explain,,,,$ET" >> "$executionTimeFile"
  echo "$(date "+%m-%d-%Y %T") : End $outDir/$line.explain.txt"
  echo "$(date "+%m-%d-%Y %T") : End $outDir/$line.explain.txt" >> "$masterLogFile"

  # --- timed executions -----------------------------------------------------
  for ((i = 1; i <= loopCount; i++)); do
    queryFileName="$tempQueries/$queryName.$i.txt"
    outFileName="$queryOutPut/$queryName.$i.out"
    cat "$line" > "$queryFileName"
    echo "$exitString" >> "$queryFileName"

    echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName"
    echo "$(date "+%m-%d-%Y %T") : Starting $queryName.$i.txt" >> "$masterLogFile"

    # Resource counters are collected only while the query runs.
    startDstatsCollection "$dstatCountersDir" "$queryName.$i"
    START=$(($(date +%s%N) / 1000000))
    hive --database "$databaseName" -e "$(cat "$queryFileName")" > "$outFileName" 2>&1 < /dev/null
    #impala-shell -i impala2 -V -B -d $databaseName -f "$queryFileName" > "$outFileName" 2>&1
    END=$(($(date +%s%N) / 1000000))
    ET=$((END - START))
    stopDstatsCollection

    # Hive reports "Time taken: <t> seconds, Fetched: <n> row(s)": the time is
    # field 3 and the row count field 6. Only the last matching line is used
    # so each value stays on a single CSV line.
    responsetime=$(awk '/Time taken:/ && /seconds/ {v = $3} END {print v}' "$outFileName")
    returnedrowcount=$(awk '/Time taken:/ && /row\(s\)/ {v = $6} END {print v}' "$outFileName")
    jobid=$(awk '/Query ID/ {v = $4} END {print v}' "$outFileName")

    echo "$queryName.$i completed $ET msec"
    echo "$runName,$databaseName,$queryName,$i,$responsetime,$returnedrowcount,$jobid,$ET" >> "$executionTimeFile"
    echo "$(date "+%m-%d-%Y %T") : End $queryName.$i.txt" >> "$masterLogFile"
  done
done < "$filename"

# Copy log files from all machines
consLogFile "$currentDir/$outDir"
summarizedstats "$dstatCountersDir" "$resourcesummaryfile"
|