balance.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. /* balance.c
  2. * Balance IRQs.
  3. */
  4. #include <stdlib.h>
  5. #include <stdio.h>
  6. #include <string.h>
  7. #include <sys/types.h>
  8. #include <dirent.h>
  9. #include <limits.h>
  10. #include <ctype.h>
  11. #include <sys/stat.h>
  12. #include <fcntl.h>
  13. #include <unistd.h> /* open, write */
  14. #include "statistics.h"
  15. #include "cpu.h"
  16. #include "irq.h"
  17. #include "balance.h"
  18. /* Drop the dont_move flag on all IRQs for specified CPU */
  19. static int dec_weight(cpu_t *cpu, int value)
  20. {
  21. lub_list_node_t *iter;
  22. if (!cpu)
  23. return -1;
  24. if (value < 0)
  25. return -1;
  26. for (iter = lub_list_iterator_init(cpu->irqs); iter;
  27. iter = lub_list_iterator_next(iter)) {
  28. irq_t *irq;
  29. irq = (irq_t *)lub_list_node__get_data(iter);
  30. if (irq->weight >= value)
  31. irq->weight -= value;
  32. }
  33. return 0;
  34. }
  35. /* Remove IRQ from specified CPU */
  36. int remove_irq_from_cpu(irq_t *irq, cpu_t *cpu)
  37. {
  38. lub_list_node_t *node;
  39. if (!irq || !cpu)
  40. return -1;
  41. irq->cpu = NULL;
  42. node = lub_list_search(cpu->irqs, irq);
  43. if (!node)
  44. return 0;
  45. lub_list_del(cpu->irqs, node);
  46. lub_list_node_free(node);
  47. return 0;
  48. }
  49. /* Move IRQ to specified CPU. Remove IRQ from the IRQ list
  50. * of old CPU.
  51. */
  52. int move_irq_to_cpu(irq_t *irq, cpu_t *cpu)
  53. {
  54. if (!irq || !cpu)
  55. return -1;
  56. if (irq->cpu) {
  57. cpu_t *old_cpu = irq->cpu;
  58. remove_irq_from_cpu(irq, old_cpu);
  59. dec_weight(old_cpu, 1);
  60. }
  61. dec_weight(cpu, 1);
  62. irq->cpu = cpu;
  63. lub_list_add(cpu->irqs, irq);
  64. return 0;
  65. }
  66. /* Search for the best CPU. Best CPU is a CPU with minimal load.
  67. If several CPUs have the same load then the best CPU is a CPU
  68. with minimal number of assigned IRQs */
  69. static cpu_t *choose_cpu(lub_list_t *cpus, cpumask_t *cpumask, float load_limit)
  70. {
  71. lub_list_node_t *iter;
  72. lub_list_t * min_cpus = NULL;
  73. float min_load = 100.00;
  74. lub_list_node_t *node;
  75. cpu_t *cpu = NULL;
  76. for (iter = lub_list_iterator_init(cpus); iter;
  77. iter = lub_list_iterator_next(iter)) {
  78. cpu = (cpu_t *)lub_list_node__get_data(iter);
  79. if (!cpu_isset(cpu->id, *cpumask))
  80. continue;
  81. if (cpu->load >= load_limit)
  82. continue;
  83. if ((!min_cpus) || (cpu->load < min_load)) {
  84. min_load = cpu->load;
  85. if (!min_cpus)
  86. min_cpus = lub_list_new(cpu_list_compare_len);
  87. while ((node = lub_list__get_tail(min_cpus))) {
  88. lub_list_del(min_cpus, node);
  89. lub_list_node_free(node);
  90. }
  91. lub_list_add(min_cpus, cpu);
  92. }
  93. if (cpu->load == min_load)
  94. lub_list_add(min_cpus, cpu);
  95. }
  96. if (!min_cpus)
  97. return NULL;
  98. node = lub_list__get_head(min_cpus);
  99. cpu = (cpu_t *)lub_list_node__get_data(node);
  100. while ((node = lub_list__get_tail(min_cpus))) {
  101. lub_list_del(min_cpus, node);
  102. lub_list_node_free(node);
  103. }
  104. lub_list_free(min_cpus);
  105. return cpu;
  106. }
  107. static int irq_set_affinity(irq_t *irq, cpumask_t *cpumask)
  108. {
  109. char path[PATH_MAX];
  110. char buf[NR_CPUS + 1];
  111. int f;
  112. if (!irq)
  113. return -1;
  114. snprintf(path, sizeof(path),
  115. "%s/%u/smp_affinity", PROC_IRQ, irq->irq);
  116. path[sizeof(path) - 1] = '\0';
  117. if ((f = open(path, O_WRONLY | O_SYNC)) < 0)
  118. return -1;
  119. cpumask_scnprintf(buf, sizeof(buf), *cpumask);
  120. buf[sizeof(buf) - 1] = '\0';
  121. if (write(f, buf, strlen(buf)) < 0) {
  122. /* The affinity for some IRQ can't be changed. So don't
  123. consider such IRQs. The example is IRQ 0 - timer.
  124. Blacklist this IRQ. Note fprintf() without fflush()
  125. will not return I/O error due to buffers. */
  126. irq->blacklisted = 1;
  127. remove_irq_from_cpu(irq, irq->cpu);
  128. printf("Blacklist IRQ %u\n", irq->irq);
  129. }
  130. close(f);
  131. return 0;
  132. }
  133. /* Find best CPUs for IRQs need to be balanced. */
  134. int balance(lub_list_t *cpus, lub_list_t *balance_irqs,
  135. float load_limit, cpumask_t *exclude_cpus, int non_local_cpus)
  136. {
  137. lub_list_node_t *iter;
  138. for (iter = lub_list_iterator_init(balance_irqs); iter;
  139. iter = lub_list_iterator_next(iter)) {
  140. irq_t *irq;
  141. cpu_t *cpu;
  142. cpumask_t possible_cpus;
  143. irq = (irq_t *)lub_list_node__get_data(iter);
  144. /* Try to find local CPU to move IRQ to.
  145. The local CPU is CPU with native NUMA node. */
  146. /* Possible CPUs is local CPUs minus exclude-CPUs.
  147. possible_cpus = local_cpus & ~exclude_cpus */
  148. cpus_init(possible_cpus);
  149. cpus_copy(possible_cpus, *exclude_cpus);
  150. cpus_complement(possible_cpus, possible_cpus);
  151. cpus_and(possible_cpus, possible_cpus, irq->local_cpus);
  152. cpu = choose_cpu(cpus, &possible_cpus, load_limit);
  153. cpus_free(possible_cpus);
  154. /* If local CPU is not found then try to use
  155. CPU from another NUMA node. It's better then
  156. overloaded CPUs. */
  157. /* Non-local CPUs were disabled. It seems there is
  158. no advantages to use them. The all interactions will
  159. be held by QPI-like interfaces through local CPUs. */
  160. /* May be the previous note is wrong. Using of non local
  161. cpus depends on config option "non_local_cpus" now. */
  162. if (!cpu && non_local_cpus) {
  163. cpus_init(possible_cpus);
  164. cpus_copy(possible_cpus, *exclude_cpus);
  165. cpus_or(possible_cpus, possible_cpus, irq->local_cpus);
  166. cpus_complement(possible_cpus, possible_cpus);
  167. cpu = choose_cpu(cpus, &possible_cpus, load_limit);
  168. cpus_free(possible_cpus);
  169. }
  170. if (cpu) {
  171. if (irq->cpu)
  172. printf("Move IRQ %u from CPU%u to CPU%u\n",
  173. irq->irq, irq->cpu->id, cpu->id);
  174. else
  175. printf("Move IRQ %u to CPU%u\n", irq->irq, cpu->id);
  176. move_irq_to_cpu(irq, cpu);
  177. }
  178. }
  179. return 0;
  180. }
  181. int apply_affinity(lub_list_t *balance_irqs)
  182. {
  183. lub_list_node_t *iter;
  184. for (iter = lub_list_iterator_init(balance_irqs); iter;
  185. iter = lub_list_iterator_next(iter)) {
  186. irq_t *irq;
  187. irq = (irq_t *)lub_list_node__get_data(iter);
  188. if (!irq->cpu)
  189. continue;
  190. irq_set_affinity(irq, &(irq->cpu->cpumask));
  191. }
  192. return 0;
  193. }
  194. /* Count the number of intr-not-null IRQs and minimal IRQ weight */
  195. static int irq_list_info(lub_list_t *irqs, int *min_weight,
  196. unsigned int *irq_num, unsigned int *candidates_num)
  197. {
  198. lub_list_node_t *iter;
  199. if (!irqs)
  200. return -1;
  201. if (min_weight)
  202. *min_weight = -1;
  203. if (irq_num)
  204. *irq_num = 0;
  205. if (candidates_num)
  206. *candidates_num = 0;
  207. for (iter = lub_list_iterator_init(irqs); iter;
  208. iter = lub_list_iterator_next(iter)) {
  209. irq_t *irq = (irq_t *)lub_list_node__get_data(iter);
  210. if (irq->intr == 0)
  211. continue;
  212. if (min_weight) {
  213. if ((*min_weight < 0) || (irq->weight < *min_weight))
  214. *min_weight = irq->weight;
  215. }
  216. if (irq_num)
  217. *irq_num += 1;
  218. if (irq->weight)
  219. continue;
  220. if (candidates_num)
  221. *candidates_num += 1;
  222. }
  223. return 0;
  224. }
  225. /* Search for most overloaded CPU */
  226. static cpu_t * most_overloaded_cpu(lub_list_t *cpus, float threshold)
  227. {
  228. lub_list_node_t *iter;
  229. cpu_t *overloaded_cpu = NULL;
  230. float max_load = 0.0;
  231. /* Search for the most overloaded CPU.
  232. The load must be greater than threshold. */
  233. for (iter = lub_list_iterator_init(cpus); iter;
  234. iter = lub_list_iterator_next(iter)) {
  235. cpu_t *cpu = (cpu_t *)lub_list_node__get_data(iter);
  236. int min_weight = -1;
  237. unsigned int irq_num = 0;
  238. if (cpu->load < threshold)
  239. continue;
  240. if (cpu->load <= max_load)
  241. continue;
  242. /* Don't move last IRQ */
  243. if (lub_list_len(cpu->irqs) <= 1)
  244. continue;
  245. irq_list_info(cpu->irqs, &min_weight, &irq_num, NULL);
  246. /* All IRQs has intr=0 */
  247. if (irq_num == 0)
  248. continue;
  249. if (min_weight > 0)
  250. dec_weight(cpu, min_weight);
  251. /* Ok, it's good CPU to try to free it */
  252. max_load = cpu->load;
  253. overloaded_cpu = cpu;
  254. }
  255. return overloaded_cpu;
  256. }
  257. /* Search for the overloaded CPUs and then choose best IRQ for moving to
  258. another CPU. The best IRQ is IRQ with maximum number of interrupts.
  259. The IRQs with small number of interrupts have very low load or very
  260. high load (in a case of NAPI). */
  261. int choose_irqs_to_move(lub_list_t *cpus, lub_list_t *balance_irqs,
  262. float threshold, birq_choose_strategy_e strategy,
  263. cpumask_t *exclude_cpus)
  264. {
  265. lub_list_node_t *iter;
  266. cpu_t *overloaded_cpu = NULL;
  267. irq_t *irq_to_move = NULL;
  268. unsigned long long max_intr = 0;
  269. unsigned long long min_intr = (unsigned long long)(-1);
  270. unsigned int choose = 0;
  271. unsigned int current = 0;
  272. /* Stage 1: Try to move active IRQs from excluded-CPUs */
  273. if (!cpus_empty(*exclude_cpus)) {
  274. /* Iterate CPU list and find excluded ones */
  275. for (iter = lub_list_iterator_init(cpus); iter;
  276. iter = lub_list_iterator_next(iter)) {
  277. lub_list_node_t *iter2;
  278. cpu_t *cpu = (cpu_t *)lub_list_node__get_data(iter);
  279. if (!cpu_isset(cpu->id, *exclude_cpus))
  280. continue;
  281. /* Move all active IRQs to another CPUs */
  282. for (iter2 = lub_list_iterator_init(cpu->irqs); iter2;
  283. iter2 = lub_list_iterator_next(iter2)) {
  284. irq_t *irq = (irq_t *)lub_list_node__get_data(iter2);
  285. if (irq->intr == 0)
  286. continue;
  287. lub_list_add(balance_irqs, irq);
  288. }
  289. }
  290. }
  291. /* Stage 2: Move IRQs from overloaded CPUs */
  292. /* Search for overloaded CPUs */
  293. if (!(overloaded_cpu = most_overloaded_cpu(cpus, threshold)))
  294. return 0;
  295. if (strategy == BIRQ_CHOOSE_RND) {
  296. unsigned int candidates = 0;
  297. irq_list_info(overloaded_cpu->irqs, NULL, NULL, &candidates);
  298. if (candidates == 0)
  299. return 0;
  300. choose = rand() % candidates;
  301. }
  302. /* Search for the IRQ (owned by overloaded CPU) with
  303. maximum/minimum number of interrupts. */
  304. for (iter = lub_list_iterator_init(overloaded_cpu->irqs); iter;
  305. iter = lub_list_iterator_next(iter)) {
  306. irq_t *irq = (irq_t *)lub_list_node__get_data(iter);
  307. /* Don't move any IRQs with intr=0. It can be unused IRQ. In
  308. this case the moving is not needed. It can be overloaded
  309. (by NAPI) IRQs. In this case it will be not moved anyway. */
  310. if (irq->intr == 0)
  311. continue;
  312. if (irq->weight)
  313. continue;
  314. if (strategy == BIRQ_CHOOSE_MAX) {
  315. /* Get IRQ with max intr */
  316. if (irq->intr > max_intr) {
  317. max_intr = irq->intr;
  318. irq_to_move = irq;
  319. }
  320. } else if (strategy == BIRQ_CHOOSE_MIN) {
  321. /* Get IRQ with min intr */
  322. if (irq->intr < min_intr) {
  323. min_intr = irq->intr;
  324. irq_to_move = irq;
  325. }
  326. } else if (strategy == BIRQ_CHOOSE_RND) {
  327. if (current == choose) {
  328. irq_to_move = irq;
  329. break;
  330. }
  331. }
  332. current++;
  333. }
  334. if (irq_to_move) {
  335. /* Don't move this IRQ while next iteration. */
  336. irq_to_move->weight = 1;
  337. lub_list_add(balance_irqs, irq_to_move);
  338. }
  339. return 0;
  340. }