# runqueries.sh
  1. #!/usr/bin/bash
  2. databaseName="$1"
  3. filename="$2"
  4. runName="$3"
  5. DEFAULT=default
  6. samplingrate=1
  7. startDstatsCollection () {
  8. if [ -z "$1" ] # Is parameter #1 zero length?
  9. then
  10. echo "-Parameter #1 run path is zero length.-" # Or no parameter passed.
  11. return 0
  12. fi
  13. if [ -z "$2" ] # Is parameter #1 zero length?
  14. then
  15. echo "-Parameter #2 collection name is zero length.-" # Or no parameter passed.
  16. return 0
  17. fi
  18. dstatoufileextension="csv"
  19. dstatoutdir=${1-$DEFAULT}
  20. collectionname=${2-$DEFAULT}
  21. collectioncsvfile="$collectionname.csv"
  22. while read -r machinename
  23. do
  24. echo "Start dstat on $machinename"
  25. collectioncsvfile="$machinename.$collectionname.csv"
  26. outfile="$dstatoutdir/$collectioncsvfile"
  27. sudo ssh $machinename "mkdir -p \"$dstatoutdir\" ;" < /dev/null
  28. sudo ssh $machinename "dstat -t -a --output $outfile $samplingrate > /dev/null & " < /dev/null
  29. done < dn.txt
  30. }
  31. stopDstatsCollection () {
  32. while read -r machinename
  33. do
  34. echo $machinename
  35. echo "Stop dstat on $machinename"
  36. #ssh -i $HOME/cloud.key cloud-user@$machinename "nohup ps aux | grep dstat | awk '{print \$2}' | xargs kill > /dev/null 2>&1 & " < /dev/null
  37. #ssh -i $HOME/cloud.key cloud-user@$machinename "ps aux | grep dstat | awk '{print \$2}' | xargs kill > /dev/null & " & < /dev/null
  38. sudo ssh $machinename "ps aux | grep dstat | awk '{print \$2}' | xargs kill > /dev/null & " & < /dev/null
  39. #ssh -n -o BatchMode=yes $HOME/cloud.key cloud-user@$machinename "/home/cloud-user/impala-tpcds-kit/queries/stopdstat.sh" 2>&1 &
  40. #ssh -i $HOME/cloud.key cloud-user@$machinename 'bash -s' < "/home/cloud-user/impala-tpcds-kit/queries/stopdstat.sh" < /dev/null
  41. done < dn.txt
  42. }
  43. consLogFile () {
  44. if [ -z "$1" ]
  45. then
  46. echo "-Parameter #1 run path is zero length.-"
  47. return 0
  48. fi
  49. headmachine=$(cat dn.txt | head -n 1)
  50. sourcedir=${1-$DEFAULT}
  51. while read -r machinename
  52. do
  53. if [ "$headmachine" = "$machinename" ]
  54. then
  55. continue
  56. fi
  57. echo "Copy from $machinename $sourcedir"
  58. sudo scp -rp $machinename:$sourcedir $baseoutdir
  59. done < dn.txt
  60. }
  61. summarizedstats () {
  62. if [ -z "$1" ] # Is parameter #1 zero length?
  63. then
  64. echo "-Parameter #1 run path is zero length.-" # Or no parameter passed.
  65. return 0
  66. fi
  67. if [ -z "$2" ] # Is parameter #1 zero length?
  68. then
  69. echo "-Parameter #2 collection name is zero length.-" # Or no parameter passed.
  70. return 0
  71. fi
  72. dstatfilesdir=${1-$DEFAULT}
  73. dstatsummaryfile=${2-$DEFAULT}
  74. echo $dstatfilesdir
  75. echo $dstatsummaryfile
  76. echo "machineName,query,executionNumber,elapsedTimeSecs,avgCpuUser,avgCpuSys,avgDiskReadBytes/sec,avgDiskWriteBytes/sec,avgNetworkReadBytes/sec,avgNetworkWriteBytes/sec,totalCpuUser,totalCpuSys,totalDiskReadBytes,totalDiskWriteBytes,totalNetworkReadBytes,totalNetworkWriteBytes,filename" > $dstatsummaryfile
  77. cd $dstatfilesdir
  78. for filename in `ls *csv`
  79. do
  80. echo "Summarizing on $filename..."
  81. # Some book keeping
  82. parsecsvdata=0
  83. sampleCount=0
  84. # Avg counts
  85. avgCpuUser=0
  86. avgCpusys=0
  87. avgDiskRead=0
  88. avgDiskWrite=0
  89. avgNetworkRead=0
  90. avgNetworkWrite=0
  91. # total counts
  92. totCpuUser=0
  93. totCpusys=0
  94. totdiskRead=0
  95. totdiskWrite=0
  96. totnetworkRead=0
  97. totnetworkWrite=0
  98. # Set "," as the field separator using $IFS
  99. # and read line by line using while read combo
  100. while IFS=',' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12
  101. do
  102. if [ "$c1" = "\"date/time\"" ]
  103. then
  104. parsecsvdata=1
  105. continue
  106. fi
  107. if [ $parsecsvdata -eq 1 ]
  108. then
  109. let sampleCount=$sampleCount+1
  110. totCpuUser=$(echo "scale=4; $totCpuUser+$c2" | bc)
  111. totCpusys=$(echo "scale=4; $totCpusys+$c3" | bc)
  112. totdiskRead=$(echo "scale=4; $totdiskRead+$c8" | bc)
  113. totdiskWrite=$(echo "scale=4; $totdiskWrite+$c9" | bc)
  114. totnetworkRead=$(echo "scale=4; $totnetworkRead+$c10" | bc)
  115. totnetworkWrite=$(echo "scale=4; $totnetworkWrite+$c11" | bc)
  116. #echo "$totCpuUser $totCpusys $totdiskRead $totdiskWrite $totnetworkRead $totnetworkWrite $c1 $c2 $c3 $c4 $c5 $c6 $c7"
  117. fi
  118. done < "$filename"
  119. # Get the averages
  120. avgCpuUser=$(echo "scale=4; $totCpuUser/$sampleCount" | bc)
  121. avgCpusys=$(echo "scale=4; $totCpusys/$sampleCount" | bc)
  122. avgDiskRead=$(echo "scale=4; $totdiskRead/$sampleCount" | bc)
  123. avgDiskWrite=$(echo "scale=4; $totdiskWrite/$sampleCount" | bc)
  124. avgNetworkRead=$(echo "scale=4; $totnetworkRead/$sampleCount" | bc)
  125. avgNetworkWrite=$(echo "scale=4; $totnetworkWrite/$sampleCount" | bc)
  126. # Parse the test metadata from the file name
  127. IFS=. read -a testinfo <<<"$filename"
  128. machineName="${testinfo[0]}"
  129. query="${testinfo[1]}"
  130. executionNumber="${testinfo[3]}"
  131. echo "$machineName,$query,$executionNumber,$sampleCount,$avgCpuUser,$avgCpusys,$avgDiskRead,$avgDiskWrite,$avgNetworkRead,$avgNetworkWrite,$totCpuUser,$totCpusys,$totdiskRead,$totdiskWrite,$totnetworkRead,$totnetworkWrite,$filename" >> $dstatsummaryfile
  132. done
  133. }
  134. loopCount="4"
  135. timeStamp="$(date "+%s-%d-%H-%M-%S")"
  136. baseoutdir="$runName/$databaseName"
  137. outDir="$baseoutdir/$timeStamp"
  138. explainPlans="$outDir/explainPlans"
  139. queryOutPut="$outDir/queryOutPut"
  140. profileOutPut="$outDir/queryProfile"
  141. tempQueries="$outDir/tempQueries"
  142. masterLogFile="$outDir/masterlog.txt"
  143. executionTimeFile="$outDir/summary.csv"
  144. currentDir=$(pwd)
  145. dstatCountersDir="$currentDir/$outDir/dstats"
  146. resourcesummaryfile="$currentDir/$outDir/resourceSummary.csv"
  147. mkdir -p "$outDir"
  148. mkdir -p "$tempQueries"
  149. mkdir -p "$queryOutPut"
  150. mkdir -p "$explainPlans"
  151. mkdir -p "$profileOutPut"
  152. mkdir -p "$dstatCountersDir"
  153. echo "runName,databaseName,queryName,executionNumber,responseTime,rows(s),jobID,executionTime" > $executionTimeFile
  154. explainString="explain "
  155. exitString="exit;"
  156. while read -r line
  157. do
  158. name=$line
  159. queryName=$line
  160. queryFileName="$tempQueries/$queryName.explain.txt"
  161. outFileName="$explainPlans/$queryName.explain.out"
  162. echo "$explainString " | cat - $line > "$queryFileName"
  163. echo "$exitString" >> "$queryFileName"
  164. echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName"
  165. echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName" >> $masterLogFile
  166. START=$(($(date +%s%N)/1000000))
  167. hive --database $databaseName -i "$queryFileName" > "$outFileName" 2>&1
  168. #impala-shell -i impala2 -V -B -d $databaseName -f "$queryFileName" > "$outFileName" 2>&1
  169. END=$(($(date +%s%N)/1000000))
  170. ET=$((END-START))
  171. echo "$runName,$databaseName,$queryName.explain,explain,$ET" >> $executionTimeFile
  172. echo "$(date "+%m-%d-%Y %T") : End $outDir/$line.explain.txt"
  173. echo "$(date "+%m-%d-%Y %T") : End $outDir/$line.explain.txt" >> $masterLogFile
  174. for i in $(seq 1 1 $loopCount)
  175. do
  176. queryFileName="$tempQueries/$queryName.$i.txt"
  177. outFileName="$queryOutPut/$queryName.$i.out"
  178. outProfileName="$profileOutPut/$queryName.$i.out"
  179. cat $line > "$queryFileName"
  180. echo "$exitString" >> "$queryFileName"
  181. # Run queries without profiling
  182. echo "$(date "+%m-%d-%Y %T") : Starting $queryFileName"
  183. echo "$(date "+%m-%d-%Y %T") : Starting $queryName.$i.txt" >> $masterLogFile
  184. # start data trace
  185. startDstatsCollection $dstatCountersDir "$queryName.$i"
  186. START=$(($(date +%s%N)/1000000))
  187. hive --database $databaseName -e "`cat $queryFileName`" > "$outFileName" 2>&1
  188. #impala-shell -i impala2 -V -B -d $databaseName -f "$queryFileName" > "$outFileName" 2>&1
  189. END=$(($(date +%s%N)/1000000))
  190. ET=$((END-START))
  191. stopDstatsCollection
  192. responsetime=$(cat $outFileName | grep "Time taken:" | grep "seconds" | awk '{print $3}')
  193. returnedrowcount=$(cat $outFileName | grep "Time taken:" | grep "row(s)" | awk '{print $63}')
  194. jobid=$(cat $outFileName | grep "Query ID" | awk '{print $4}')
  195. echo "$queryName.$i completed $ET msec"
  196. echo "$runName,$databaseName,$queryName,$i,$responsetime,$returnedrowcount,$jobid,$ET" >> $executionTimeFile
  197. echo "$(date "+%m-%d-%Y %T") : End $queryName.$i.txt" >> $masterLogFile
  198. done
  199. done < "$filename"
  200. # Copy log files from all machines
  201. consLogFile $currentDir/$outDir
  202. summarizedstats $dstatCountersDir $resourcesummaryfile