/* SFP code: SFP is a Rectilinear Steiner Minimum Tree (RSMT) heuristic. It accepts inputs in the ISPD'08 format described at http://www.ispd.cc/contests/08/ispd08rc.html. Copyright 2019-2022 Texas State University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Authors: Martin Burtscher, Aarti Kothari, and Alex Fallin URL: The latest version of this code is available at https://cs.txstate.edu/~burtscher/research/SFP/. */ #include #include #include #include #include #include #include #include #include static const int MaxPins = 256; // must be a power of 2 static const int WS = 32; // warp size using ID = short; // must be signed (point and edge IDs) using ctype = int; // must be signed (coordinates and distances) // change requires further change below typedef struct { ID src; ID dst; } edge; static __device__ int currpos1 = 0; static __device__ int currpos2 = 0; static __device__ int wlsize = 0; template static __device__ void buildMST(const ID num, const ctype* const __restrict__ x, const ctype* const __restrict__ y, edge* const __restrict__ edges, volatile ctype dist[PinLimit]) { __shared__ volatile ID source[WarpsPerBlock][PinLimit]; __shared__ volatile ID destin[WarpsPerBlock][PinLimit]; __shared__ volatile ctype mindj[WarpsPerBlock]; const int lane = threadIdx.x % WS; const int warp = threadIdx.x / WS; // initialize ID numItems = num - 1; for (ID i = lane; i < numItems; i += WS) dist[i] = INT_MAX; // change if ctype changed for (ID i = lane; i < numItems; i += WS) destin[warp][i] = (ID)(i + 1); // Prim's MST algorithm ID src = 0; for (ID cnt = 0; cnt < num - 1; cnt++) { __syncwarp(); if (lane == 0) mindj[warp] = INT_MAX; // update distances __syncwarp(); for (ID j = lane; j < numItems; j += WS) { const ID dst = destin[warp][j]; const ctype dnew = abs(x[src] - x[dst]) + abs(y[src] - y[dst]); ctype d = dist[j]; if (d > dnew) { d = dnew; dist[j] = dnew; source[warp][j] = src; } const int upv = d * (MaxPins * 2) + j; // tie breaker for determinism atomicMin((ctype*)&mindj[warp], upv); } // create new edge __syncwarp(); const ID j = mindj[warp] % (MaxPins * 2); src = destin[warp][j]; numItems--; if (lane == 0) { edges[cnt].src = source[warp][j]; edges[cnt].dst = src; dist[j] = dist[numItems]; source[warp][j] = source[warp][numItems]; destin[warp][j] = destin[warp][numItems]; } } } template static __device__ bool insertSteinerPoints(ID& num, ctype* const __restrict__ x, ctype* const __restrict__ y, const edge* const __restrict__ edges, volatile ctype dist[PinLimit]) { __shared__ volatile ID adj[WarpsPerBlock][PinLimit][8]; __shared__ int cnt[WarpsPerBlock][PinLimit]; const int lane = threadIdx.x % WS; const int warp = threadIdx.x / WS; const ID top = num; // create adjacency lists for (ID i = lane; i < top; i += WS) cnt[warp][i] = 0; __syncwarp(); for (ID e = lane; e < top - 1; e += WS) { dist[e] = -1; const ID s = edges[e].src; const ID d = edges[e].dst; if ((x[d] != x[s]) || (y[d] != y[s])) { const int ps = atomicAdd(&cnt[warp][s], 1); adj[warp][s][ps] = e; const int pd = atomicAdd(&cnt[warp][d], 1); adj[warp][d][pd] = e; } } // find best distance for each triangle __syncwarp(); for (ID s = lane; s < top; s += WS) { if (cnt[warp][s] >= 2) { const ctype x0 = x[s]; const ctype y0 = y[s]; for (char j = 0; j < cnt[warp][s] - 1; j++) { const ID e1 = adj[warp][s][j]; const ID d1 = (s != edges[e1].src) ? edges[e1].src : edges[e1].dst; const ctype x1 = x[d1]; const ctype y1 = y[d1]; for (char k = j + 1; k < cnt[warp][s]; k++) { const ID e2 = adj[warp][s][k]; const ID d2 = (s != edges[e2].src) ? edges[e2].src : edges[e2].dst; const ctype stx = max(min(x0, x1), min(max(x0, x1), x[d2])); const ctype sty = max(min(y0, y1), min(max(y0, y1), y[d2])); const ctype rd = abs(stx - x0) + abs(sty - y0); if (rd > 0) { const ctype rd1 = rd * (MaxPins * 2) + e1; // tie breaker const ctype rd2 = rd * (MaxPins * 2) + e2; // tie breaker atomicMax((ctype*)&dist[e1], rd2); atomicMax((ctype*)&dist[e2], rd1); } } } } } // process "triangles" to find best candidate Steiner points __syncwarp(); bool updated = false; for (ID e1 = lane; __any_sync(0xffffffff, e1 < top - 2); e1 += WS) { bool insert = false; ctype stx, sty; if (e1 < top - 2) { const ctype d1 = dist[e1]; if (d1 > 0) { const ID e2 = d1 % (MaxPins * 2); if (e2 > e1) { const ctype d2 = dist[e2]; if (e1 == d2 % (MaxPins * 2)) { const ctype x0 = x[edges[e1].src]; const ctype y0 = y[edges[e1].src]; const ctype x1 = x[edges[e1].dst]; const ctype y1 = y[edges[e1].dst]; ctype x2 = x[edges[e2].src]; ctype y2 = y[edges[e2].src]; if (((x2 == x0) && (y2 == y0)) || ((x2 == x1) && (y2 == y1))) { x2 = x[edges[e2].dst]; y2 = y[edges[e2].dst]; } updated = true; insert = true; stx = max(min(x0, x1), min(max(x0, x1), x2)); sty = max(min(y0, y1), min(max(y0, y1), y2)); } } } } const int bal = __ballot_sync(0xffffffff, insert); const int pos = __popc(bal & ~(-1 << lane)) + num; if (insert) { x[pos] = stx; y[pos] = sty; } num += __popc(bal); } return __any_sync(0xffffffff, updated); } template static __device__ inline void processSmallNet(const int i, const int* const __restrict__ idxin, const ctype* const __restrict__ xin, const ctype* const __restrict__ yin, int* const __restrict__ idxout, ctype* const __restrict__ xout, ctype* const __restrict__ yout, edge* const __restrict__ edges, int* const __restrict__ wl) { __shared__ volatile ctype dist[WarpsPerBlock][PinLimit]; const int lane = threadIdx.x % WS; const int warp = threadIdx.x / WS; // initialize arrays and copy input coords to output const int pin = idxin[i]; const ID num = idxin[i + 1] - pin; const int pout = 2 * pin; if (lane == 0) idxout[i] = pout; for (ID j = lane; j < num; j += WS) xout[pout + j] = xin[pin + j]; for (ID j = lane; j < num; j += WS) yout[pout + j] = yin[pin + j]; // process nets if (num == 2) { if (lane == 0) edges[pout] = edge{0, 1}; } else if (num == 3) { ctype x0, y0; if (lane < 3) { edges[pout + lane] = edge{(short)lane, 3}; x0 = xout[pout + lane]; y0 = yout[pout + lane]; } const ctype x1 = __shfl_sync(0xffffffff, x0, 1); const ctype y1 = __shfl_sync(0xffffffff, y0, 1); const ctype x2 = __shfl_sync(0xffffffff, x0, 2); const ctype y2 = __shfl_sync(0xffffffff, y0, 2); if (lane == 0) { xout[pout + 3] = max(min(x0, x1), min(max(x0, x1), x2)); yout[pout + 3] = max(min(y0, y1), min(max(y0, y1), y2)); } } else if (num <= 32) { // iterate until all Steiner points added ID cnt = num; do { buildMST(cnt, &xout[pout], &yout[pout], &edges[pout], dist[warp]); } while (insertSteinerPoints(cnt, &xout[pout], &yout[pout], &edges[pout], dist[warp])); } else { if (lane == 0) wl[atomicAdd(&wlsize, 1)] = i; } } template static __device__ inline void processLargeNet(const int i, const int* const __restrict__ idxin, ctype* const __restrict__ xout, ctype* const __restrict__ yout, edge* const __restrict__ edges) { __shared__ volatile ctype dist[WarpsPerBlock][PinLimit]; const int warp = threadIdx.x / WS; const int pin = idxin[i]; const ID num = idxin[i + 1] - pin; const int pout = 2 * pin; // iterate until all Steiner points added ID cnt = num; do { buildMST(cnt, &xout[pout], &yout[pout], &edges[pout], dist[warp]); } while (insertSteinerPoints(cnt, &xout[pout], &yout[pout], &edges[pout], dist[warp])); } template static __global__ __launch_bounds__(WarpsPerBlock * WS, 2) void largeNetKernel(const int* const __restrict__ idxin, const ctype* const __restrict__ xin, const ctype* const __restrict__ yin, int* const __restrict__ idxout, ctype* __restrict__ xout, ctype* __restrict__ yout, edge* __restrict__ edges, const int numnets, int* const __restrict__ wl) { // compute Steiner points and edges const int lane = threadIdx.x % WS; do { int i; if (lane == 0) i = atomicAdd(&currpos1, 1); i = __shfl_sync(0xffffffff, i, 0); if (i >= numnets) break; processSmallNet(i, idxin, xin, yin, idxout, xout, yout, edges, wl); } while (true); // set final element if ((threadIdx.x == 0) && (blockIdx.x == 0)) { idxout[numnets] = 2 * idxin[numnets]; } } template static __global__ __launch_bounds__(WarpsPerBlock * WS, 2) void smallNetKernel(const int* const __restrict__ idxin, ctype* __restrict__ xout, ctype* __restrict__ yout, edge* __restrict__ edges, int* const __restrict__ wl) { // compute Steiner points and edges const int lane = threadIdx.x % WS; do { int i; if (lane == 0) i = atomicAdd(&currpos2, 1); i = __shfl_sync(0xffffffff, i, 0); if (i >= wlsize) break; processLargeNet(wl[i], idxin, xout, yout, edges); } while (true); } static void computeRSMT(const int* const __restrict__ idxin, const ctype* const __restrict__ xin, const ctype* const __restrict__ yin, int* const __restrict__ idxout, ctype* const __restrict__ xout, ctype* const __restrict__ yout, edge* const __restrict__ edges, const int numnets) { // obtain GPU info cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); const int SMs = deviceProp.multiProcessorCount; const int mTpSM = deviceProp.maxThreadsPerMultiProcessor; printf("GPU: %s with %d SMs and %d mTpSM (%.1f MHz and %.1f MHz)\n", deviceProp.name, SMs, mTpSM, deviceProp.clockRate * 0.001, deviceProp.memoryClockRate * 0.001); const int blocks = SMs * 2; printf("launching %d thread blocks with %d threads (%d threads)\n", blocks, 24 * WS, blocks * 24 * WS); // allocate and initialize GPU memory int* d_idxin; ctype* d_xin; ctype* d_yin; int* d_idxout; ctype* d_xout; ctype* d_yout; edge* d_edges; int* d_wl; const int size = idxin[numnets]; cudaMalloc((void **)&d_idxin, (numnets + 1) * sizeof(int)); cudaMalloc((void **)&d_xin, size * sizeof(ctype)); cudaMalloc((void **)&d_yin, size * sizeof(ctype)); cudaMalloc((void **)&d_idxout, (numnets + 1) * sizeof(int)); cudaMalloc((void **)&d_xout, 2 * size * sizeof(ctype)); cudaMalloc((void **)&d_yout, 2 * size * sizeof(ctype)); cudaMalloc((void **)&d_edges, 2 * size * sizeof(edge)); cudaMalloc((void **)&d_wl, numnets * sizeof(int)); cudaMemcpy(d_idxin, idxin, (numnets + 1) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_xin, xin, size * sizeof(ctype), cudaMemcpyHostToDevice); cudaMemcpy(d_yin, yin, size * sizeof(ctype), cudaMemcpyHostToDevice); // start time timeval start, end; gettimeofday(&start, NULL); // process nets cudaMemset(d_xout, -1, 2 * size * sizeof(ctype)); cudaMemset(d_yout, -1, 2 * size * sizeof(ctype)); cudaMemset(d_edges, 0, 2 * size * sizeof(edge)); largeNetKernel<24, 64><<>>(d_idxin, d_xin, d_yin, d_idxout, d_xout, d_yout, d_edges, numnets, d_wl); smallNetKernel<3, 512><<>>(d_idxin, d_xout, d_yout, d_edges, d_wl); // end time cudaDeviceSynchronize(); gettimeofday(&end, NULL); const double runtime = end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1000000.0; printf("compute time: %.6f s\n", runtime); printf("throughput: %.f nets/sec\n", numnets / runtime); // debug only cudaError_t e; cudaDeviceSynchronize(); if (cudaSuccess != (e = cudaGetLastError())) {printf("CUDA ERROR %d: %s\n", e, cudaGetErrorString(e)); exit(-1);} // transfer results from GPU cudaMemcpy(idxout, d_idxout, (numnets + 1) * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(xout, d_xout, 2 * size * sizeof(ctype), cudaMemcpyDeviceToHost); cudaMemcpy(yout, d_yout, 2 * size * sizeof(ctype), cudaMemcpyDeviceToHost); cudaMemcpy(edges, d_edges, 2 * size * sizeof(edge), cudaMemcpyDeviceToHost); // clean up cudaFree(d_wl); cudaFree(d_edges); cudaFree(d_yout); cudaFree(d_xout); cudaFree(d_idxout); cudaFree(d_yin); cudaFree(d_xin); cudaFree(d_idxin); } static ctype treeLength(const ID num, const ctype* const __restrict__ x, const ctype* const __restrict__ y, const edge* const __restrict__ edges) { // compute wire length of Steiner tree ctype len = 0; for (ID i = 0; i < num - 1; i++) { const ctype x1 = x[edges[i].src]; const ctype y1 = y[edges[i].src]; const ctype x2 = x[edges[i].dst]; const ctype y2 = y[edges[i].dst]; len += abs(x1 - x2) + abs(y1 - y2); } return len; } // struct to store grid information struct grid { int grid[3]; int* vc = NULL; int* hc = NULL; int* min_wid = NULL; int* min_space = NULL; int* via_space = NULL; int llx, lly, tile_wid, tile_height; }; // struct to store net and pin information struct net_list { int num_net; std::vector >* num_net_arr = NULL; int* net_id = NULL; int* net_num_pins = NULL; int* net_min_wid = NULL; }; // function to read in input file static void read_file(const char* file, grid& g, net_list& n) { std::string line; std::string text1, text2; int line_count = 0; // for error messages std::fstream myfile(file); if (!myfile.is_open()) {std::cout << "ERROR: Cannot open input file!\n"; exit(-1);} // read grid x and y co-ordinates and number of layers getline(myfile, line); line_count++; std::stringstream data1(line); if (!(data1 >> text1 >> g.grid[0] >> g.grid[1] >> g.grid[2])) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if (text1 != "grid") {std::cout << "ERROR: Invalid format of grid!\n"; exit(-1);} for (int i = 0; i < 3; i++) { if (g.grid[i] < 1) {std::cout << "ERROR: Grid data should be a reasonable number!\n"; exit(-1);} } // read vertical capacity of each layer getline(myfile, line); line_count++; std::stringstream data2(line); g.vc = new int [g.grid[2] + 1]; if (!(data2 >> text1 >> text2)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((text1 != "vertical") || (text2 != "capacity")) {std::cout << "ERROR: Invalid format of g.vertical capacity!\n"; exit(-1);} for (int i = 1; i <= g.grid[2]; i++) { if (data2 >> g.vc[i]) { if (g.vc[i] < 0) {std::cout << "ERROR: vertical capacity should be a reasonable number!\n"; exit(-1);} } } // read horizontal capacity of each layer getline(myfile, line); line_count++; std::stringstream data3(line); g.hc = new int [g.grid[2] + 1]; if (!(data3 >> text1 >> text2)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n";} if ((text1 != "horizontal") || (text2 != "capacity")) {std::cout << "ERROR: Invalid format of g.horizontal capacity!\n"; exit(-1);} for (int i = 1; i <= g.grid[2]; i++) { if (data3 >> g.hc[i]) { if (g.hc[i] < 0) {std::cout << "ERROR: horizontal capacity should be a reasonable number!\n"; exit(-1);} } } // read minimum width of each layer getline(myfile, line); line_count++; std::stringstream data4(line); g.min_wid = new int [g.grid[2] + 1]; if (!(data4 >> text1 >> text2)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((text1 != "minimum") || (text2 != "width")) {std::cout << "ERROR: Invalid format of minimum width!\n"; exit(-1);} for (int i = 1; i <= g.grid[2] + 1; i++) { if (data4 >> g.min_wid[i]) { if (g.min_wid[i] < 1) {std::cout << "ERROR: Minimum width should be a reasonable number!\n"; exit(-1);} } } // read minimum spacing of each layer getline(myfile, line); line_count++; std::stringstream data5(line); g.min_space = new int [g.grid[2] + 1]; if (!(data5 >> text1 >> text2)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((text1 != "minimum") || (text2 != "spacing")) {std::cout << "ERROR: Invalid format of minimum spacing!\n"; exit(-1);} for (int i = 1; i <= g.grid[2]; i++) { if (data5 >> g.min_space[i]) { if (g.min_space[i] < 0) {std::cout << "ERROR: Minimum spacing should be a reasonable number!\n"; exit(-1);} } } // read via spacing of each layer getline(myfile, line); line_count++; std::stringstream data6(line); g.via_space = new int [g.grid[2] + 1]; if (!(data6 >> text1 >> text2)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((text1 != "via") || (text2 != "spacing")) {std::cout << "ERROR: Invalid format of via spacing!\n"; exit(-1);} for (int i = 1; i <= g.grid[2]; i++) { if (data6 >> g.via_space[i]) { if (g.via_space[i] < 0) {std::cout << "ERROR: Via spacing should be a reasonable number!\n"; exit(-1);} } } // read lower left x and y co-ordinates for the global routing region, tile width and tile height per layer getline(myfile, line); line_count++; std::stringstream data7(line); if (!(data7 >> g.llx >> g.lly >> g.tile_wid >> g.tile_height)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if (g.tile_wid < 1 || g.tile_height < 1) {std::cout << "ERROR: Tile width and tile height should be a reasonable number!\n"; exit(-1);} // read total number of nets do {getline(myfile, line);} while (line == ""); line_count++; std::stringstream data8(line); if (!(data8 >> text1 >> text2 >> n.num_net)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((text1 != "num") || (text2 != "net")) {std::cout << "ERROR: Invalid format of num net!\n"; exit(-1);} if (n.num_net < 1) {std::cout << "ERROR: Number of nets should be a reasonable number!\n";} // allocate memory n.net_id = new int [n.num_net]; n.net_num_pins = new int [n.num_net]; n.net_min_wid = new int [n.num_net]; n.num_net_arr = new std::vector > [n.num_net]; // read net name, net id, number of pins and minimum width of each net for (int i = 0; i < n.num_net; i++) { getline(myfile, line); line_count++; std::stringstream data9(line); if (!(data9 >> text1 >> n.net_id[i] >> n.net_num_pins[i] >> n.net_min_wid[i])) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if ((n.net_id[i] < 0) || (n.net_num_pins[i] < 2) || (n.net_min_wid[i] < 1)) {std::cout << "ERROR: Net ID, number of pins and min width should be a reasonable number!\n"; exit(-1);} // read x, y and layer information of each net for (int j = 0; j < n.net_num_pins[i]; j++) { getline(myfile, line); line_count++; std::stringstream data10(line); int x, y, layer; if (!(data10 >> x >> y >> layer)) {std::cout << "ERROR: Line" << line_count << " couldn't be parsed!\n"; exit(-1);} if (!(x >= g.llx && x < (g.grid[0] * g.tile_wid))) x = (g.grid[0] * g.tile_wid) - 1; if (!(y >= g.lly && y < (g.grid[1] * g.tile_height))) y = (g.grid[1] * g.tile_height) - 1; if (!(layer > 0 && layer <= g.grid[2])) {std::cout << "ERROR: layer should be within grid!\n"; exit(-1);} if ((x < 0) || (y < 0) || (x >= INT_MAX / (MaxPins * 4)) || (y >= INT_MAX / (MaxPins * 4))) {std::cout << "ERROR: x or y out of bounds\n"; exit(-1);} n.num_net_arr[i].push_back(std::make_tuple(x, y)); } } myfile.close(); } // free info from input file static void free_memory(const grid& g, const net_list& n) { delete [] g.vc; delete [] g.hc; delete [] g.min_wid; delete [] g.min_space; delete [] g.via_space; delete [] n.net_id; delete [] n.net_num_pins; delete [] n.net_min_wid; delete [] n.num_net_arr; } int main(int argc, char* argv[]) { printf("\nSFP CUDA (%s)\n", __FILE__); printf("Copyright 2019-2022 Texas State University\n\n"); // check command line if (argc != 3) {printf("USAGE: %s file_name print_results\n", argv[0]); exit(-1);} const bool print = (atoi(argv[2]) != 0); // read input file printf("reading: %s\n", argv[1]); grid g; net_list n; read_file(argv[1], g, n); const int numnets = n.num_net; // start converting data structure int* idxin = NULL; cudaHostAlloc(&idxin, (numnets + 1) * sizeof(int), cudaHostAllocDefault); if (idxin == NULL) {printf("ERROR: idxin - host memory allocation failed\n\n"); exit(-1);} idxin[0] = 0; ID hipin = 0; int pos = 0; for (int i = 0; i < numnets; i++) { const ID num = std::min(n.net_num_pins[i], MaxPins); hipin = std::max(hipin, num); pos += num; idxin[i + 1] = pos; } // print histogram int trunc = 0; int* const hist = new int [hipin + 1]; for (int i = 0; i < hipin + 1; i++) hist[i] = 0; for (int i = 0; i < numnets; i++) { const ID num = std::min(n.net_num_pins[i], MaxPins); if (num < n.net_num_pins[i]) trunc++; hist[num]++; } int sum = 0; for (int i = 0; i < hipin + 1; i++) { sum += hist[i]; if (print) printf("hist %4d: %6.2f%% %6.2f%% %d\n", i, 100.0 * hist[i] / numnets, 100.0 * sum / numnets, hist[i]); } delete [] hist; // print info if (print) printf("\n"); printf("number of nets: %d\n", numnets); printf("max pins per net: %d\n", hipin); printf("truncated nets: %d\n", trunc); if (hipin > MaxPins) {printf("ERROR: hi_pin_count must be no more than %d\n", MaxPins); exit(-1);} // copy pin coordinates ctype* xin = NULL; cudaHostAlloc(&xin, idxin[numnets] * sizeof(ctype), cudaHostAllocDefault); if (xin == NULL) {printf("ERROR: xin - host memory allocation failed\n\n"); exit(-1);} ctype* yin = NULL; cudaHostAlloc(&yin, idxin[numnets] * sizeof(ctype), cudaHostAllocDefault); if (yin == NULL) {printf("ERROR: yin - host memory allocation failed\n\n"); exit(-1);} pos = 0; for (int i = 0; i < numnets; i++) { const ID num = idxin[i + 1] - idxin[i]; for (ID j = 0; j < num; j++) xin[pos + j] = std::get<0>(n.num_net_arr[i][j]); for (ID j = 0; j < num; j++) yin[pos + j] = std::get<1>(n.num_net_arr[i][j]); pos += num; } // allocate result storage const int size = 2 * idxin[numnets]; int* idxout = NULL; cudaHostAlloc(&idxout, (numnets + 1) * sizeof(int), cudaHostAllocDefault); if (idxout == NULL) {printf("ERROR: idxout - host memory allocation failed\n\n"); exit(-1);} ctype* xout = NULL; cudaHostAlloc(&xout, size * sizeof(ctype), cudaHostAllocDefault); if (xout == NULL) {printf("ERROR: xout - host memory allocation failed\n\n"); exit(-1);} ctype* yout = NULL; cudaHostAlloc(&yout, size * sizeof(ctype), cudaHostAllocDefault); if (yout == NULL) {printf("ERROR: yout - host memory allocation failed\n\n"); exit(-1);} edge* edges = NULL; cudaHostAlloc(&edges, size * sizeof(edge), cudaHostAllocDefault); if (edges == NULL) {printf("ERROR: edges - host memory allocation failed\n\n"); exit(-1);} // compute Steiner points and edges computeRSMT(idxin, xin, yin, idxout, xout, yout, edges, numnets); // print result long total = 0; if (print) printf("\nresulting tree lengths:\n"); for (int i = 0; i < numnets; i++) { const ctype len = treeLength(idxout[i + 1] - idxout[i], &xout[idxout[i]], &yout[idxout[i]], &edges[idxout[i]]); // body of treeLength function illustrates how to read solution total += len; if (print) { const ID num = std::min(n.net_num_pins[i], MaxPins); printf("%d: %d\n", num, len); } } printf("\ntotal wirelength: %ld\n", total); // clean up free_memory(g, n); cudaFreeHost(edges); cudaFreeHost(yout); cudaFreeHost(xout); cudaFreeHost(idxout); cudaFreeHost(yin); cudaFreeHost(xin); cudaFreeHost(idxin); return 0; }