/* balance.c
 * Balance IRQs.
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <dirent.h>
#include <limits.h>
#include <ctype.h>

#include "statistics.h"
#include "cpu.h"
#include "irq.h"
  14. /* Drop the dont_move flag on all IRQs for specified CPU */
  15. static int dec_weight(cpu_t *cpu)
  16. {
  17. lub_list_node_t *iter;
  18. if (!cpu)
  19. return -1;
  20. for (iter = lub_list_iterator_init(cpu->irqs); iter;
  21. iter = lub_list_iterator_next(iter)) {
  22. irq_t *irq;
  23. irq = (irq_t *)lub_list_node__get_data(iter);
  24. if (irq->weight)
  25. irq->weight--;
  26. }
  27. return 0;
  28. }
  29. /* Remove IRQ from specified CPU */
  30. static int remove_irq_from_cpu(irq_t *irq, cpu_t *cpu)
  31. {
  32. lub_list_node_t *node;
  33. if (!irq || !cpu)
  34. return -1;
  35. irq->cpu = NULL;
  36. node = lub_list_search(cpu->irqs, irq);
  37. if (!node)
  38. return 0;
  39. lub_list_del(cpu->irqs, node);
  40. lub_list_node_free(node);
  41. return 0;
  42. }
  43. /* Move IRQ to specified CPU. Remove IRQ from the IRQ list
  44. of old CPU. */
  45. static int move_irq_to_cpu(irq_t *irq, cpu_t *cpu)
  46. {
  47. if (!irq || !cpu)
  48. return -1;
  49. if (irq->cpu) {
  50. cpu_t *old_cpu = irq->cpu;
  51. remove_irq_from_cpu(irq, old_cpu);
  52. dec_weight(old_cpu);
  53. }
  54. dec_weight(cpu);
  55. irq->cpu = cpu;
  56. lub_list_add(cpu->irqs, irq);
  57. return 0;
  58. }
  59. /* Search for the best CPU. Best CPU is a CPU with minimal load.
  60. If several CPUs have the same load then the best CPU is a CPU
  61. with minimal number of assigned IRQs */
  62. static cpu_t *choose_cpu(lub_list_t *cpus, cpumask_t cpumask, float threshold)
  63. {
  64. lub_list_node_t *iter;
  65. lub_list_t * min_cpus = NULL;
  66. float min_load = 100.00;
  67. lub_list_node_t *node;
  68. cpu_t *cpu = NULL;
  69. for (iter = lub_list_iterator_init(cpus); iter;
  70. iter = lub_list_iterator_next(iter)) {
  71. cpu = (cpu_t *)lub_list_node__get_data(iter);
  72. if (!cpu_isset(cpu->id, cpumask))
  73. continue;
  74. if (cpu->load >= threshold)
  75. continue;
  76. if ((!min_cpus) || (cpu->load < min_load)) {
  77. min_load = cpu->load;
  78. if (!min_cpus)
  79. min_cpus = lub_list_new(cpu_list_compare_len);
  80. while ((node = lub_list__get_tail(min_cpus))) {
  81. lub_list_del(min_cpus, node);
  82. lub_list_node_free(node);
  83. }
  84. lub_list_add(min_cpus, cpu);
  85. }
  86. if (cpu->load == min_load)
  87. lub_list_add(min_cpus, cpu);
  88. }
  89. if (!min_cpus)
  90. return NULL;
  91. node = lub_list__get_head(min_cpus);
  92. cpu = (cpu_t *)lub_list_node__get_data(node);
  93. while ((node = lub_list__get_tail(min_cpus))) {
  94. lub_list_del(min_cpus, node);
  95. lub_list_node_free(node);
  96. }
  97. lub_list_free(min_cpus);
  98. return cpu;
  99. }
  100. static int irq_set_affinity(irq_t *irq, cpumask_t cpumask)
  101. {
  102. char path[PATH_MAX];
  103. char buf[NR_CPUS + 1];
  104. FILE *fd;
  105. if (!irq)
  106. return -1;
  107. sprintf(path, "%s/%u/smp_affinity", PROC_IRQ, irq->irq);
  108. if (!(fd = fopen(path, "w")))
  109. return -1;
  110. cpumask_scnprintf(buf, sizeof(buf), cpumask);
  111. fprintf(fd, "%s", buf);
  112. fclose(fd);
  113. /* Check for newly apllied affinity. The affinities for some
  114. IRQ can't be changed. So don't consider such IRQs. The
  115. example is IRQ 0 - timer. */
  116. irq_get_affinity(irq);
  117. if (!cpus_equal(irq->affinity, cpumask)) {
  118. /* Blacklist this IRQ */
  119. irq->blacklisted = 1;
  120. remove_irq_from_cpu(irq, irq->cpu);
  121. printf("Blacklist IRQ %u\n", irq->irq);
  122. }
  123. return 0;
  124. }
  125. /* Find best CPUs for IRQs need to be balanced. */
  126. int balance(lub_list_t *cpus, lub_list_t *balance_irqs, float threshold)
  127. {
  128. lub_list_node_t *iter;
  129. for (iter = lub_list_iterator_init(balance_irqs); iter;
  130. iter = lub_list_iterator_next(iter)) {
  131. irq_t *irq;
  132. cpu_t *cpu;
  133. irq = (irq_t *)lub_list_node__get_data(iter);
  134. /* Try to find local CPU to move IRQ to.
  135. The local CPU is CPU with native NUMA node. */
  136. cpu = choose_cpu(cpus, irq->local_cpus, threshold);
  137. /* If local CPU is not found then try to use
  138. CPU from another NUMA node. It's better then
  139. overloaded CPUs. */
  140. /* Non-local CPUs were disabled. It seems there is
  141. no advantages to use them. The all interactions will
  142. be held by QPI-like interfaces through local CPUs. */
  143. /* if (!cpu) {
  144. cpumask_t complement;
  145. cpus_complement(complement, irq->local_cpus);
  146. cpu = choose_cpu(cpus, complement, threshold);
  147. }
  148. */
  149. if (cpu) {
  150. if (irq->cpu)
  151. printf("Move IRQ %u from CPU%u to CPU%u\n",
  152. irq->irq, irq->cpu->id, cpu->id);
  153. else
  154. printf("Move IRQ %u to CPU%u\n", irq->irq, cpu->id);
  155. move_irq_to_cpu(irq, cpu);
  156. }
  157. }
  158. return 0;
  159. }
  160. int apply_affinity(lub_list_t *balance_irqs)
  161. {
  162. lub_list_node_t *iter;
  163. for (iter = lub_list_iterator_init(balance_irqs); iter;
  164. iter = lub_list_iterator_next(iter)) {
  165. irq_t *irq;
  166. irq = (irq_t *)lub_list_node__get_data(iter);
  167. if (!irq->cpu)
  168. continue;
  169. irq_set_affinity(irq, irq->cpu->cpumask);
  170. }
  171. return 0;
  172. }
  173. /* Search for the overloaded CPUs and then choose best IRQ for moving to
  174. another CPU. The best IRQ is IRQ with maximum number of interrupts.
  175. The IRQs with small number of interrupts have very low load or very
  176. high load (in a case of NAPI). */
  177. int choose_irqs_to_move(lub_list_t *cpus, lub_list_t *balance_irqs, float threshold)
  178. {
  179. lub_list_node_t *iter;
  180. cpu_t *overloaded_cpu = NULL;
  181. irq_t *irq_to_move = NULL;
  182. float max_load = 0.0;
  183. unsigned long long max_intr = 0;
  184. /* Search for the most overloaded CPU.
  185. The load must be greater than threshold. */
  186. for (iter = lub_list_iterator_init(cpus); iter;
  187. iter = lub_list_iterator_next(iter)) {
  188. cpu_t *cpu = (cpu_t *)lub_list_node__get_data(iter);
  189. if (cpu->load < threshold)
  190. continue;
  191. if (cpu->load > max_load) {
  192. max_load = cpu->load;
  193. overloaded_cpu = cpu;
  194. }
  195. }
  196. /* Can't find overloaded CPUs */
  197. if (!overloaded_cpu)
  198. return 0;
  199. /* Search for the IRQ (owned by overloaded CPU) with
  200. maximum number of interrupts. */
  201. if (lub_list_len(overloaded_cpu->irqs) <= 1)
  202. return 0;
  203. for (iter = lub_list_iterator_init(overloaded_cpu->irqs); iter;
  204. iter = lub_list_iterator_next(iter)) {
  205. irq_t *irq = (irq_t *)lub_list_node__get_data(iter);
  206. /* Don't move any IRQs with intr=0. It can be unused IRQ. In
  207. this case the moving is not needed. It can be overloaded
  208. (by NAPI) IRQs. In this case it will be not moved anyway. */
  209. if (irq->intr == 0)
  210. continue;
  211. if (irq->weight)
  212. continue;
  213. if (irq->intr >= max_intr) {
  214. max_intr = irq->intr;
  215. irq_to_move = irq;
  216. }
  217. }
  218. if (irq_to_move) {
  219. /* Don't move this IRQ while next iteration. */
  220. irq_to_move->weight = 1;
  221. lub_list_add(balance_irqs, irq_to_move);
  222. }
  223. return 0;
  224. }