Training courses

Kernel and Embedded Linux

Bootlin training courses

Embedded Linux, kernel,
Yocto Project, Buildroot, real-time,
graphics, boot time, debugging...

Bootlin logo

Elixir Cross Referencer

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2012, Microsoft Corporation.
 *
 * Author:
 *   K. Y. Srinivasan <kys@microsoft.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>

#include <linux/hyperv.h>

#define CREATE_TRACE_POINTS
#include "hv_trace_balloon.h"

/*
 * We begin with definitions supporting the Dynamic Memory protocol
 * with the host.
 *
 * Begin protocol definitions.
 */



/*
 * Protocol versions. The low word is the minor version, the high word the major
 * version.
 *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changes to 0.2 on 2009/05/14
 * Changes to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
 */

#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)

enum {
	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),

	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,

	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
};



/*
 * Message Types
 */

enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR			= 0,
	DM_VERSION_REQUEST		= 1,
	DM_VERSION_RESPONSE		= 2,
	DM_CAPABILITIES_REPORT		= 3,
	DM_CAPABILITIES_RESPONSE	= 4,
	DM_STATUS_REPORT		= 5,
	DM_BALLOON_REQUEST		= 6,
	DM_BALLOON_RESPONSE		= 7,
	DM_UNBALLOON_REQUEST		= 8,
	DM_UNBALLOON_RESPONSE		= 9,
	DM_MEM_HOT_ADD_REQUEST		= 10,
	DM_MEM_HOT_ADD_RESPONSE		= 11,
	DM_VERSION_03_MAX		= 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE			= 12,
	DM_VERSION_1_MAX		= 12
};


/*
 * Structures defining the dynamic memory management
 * protocol.
 */

union dm_version {
	struct {
		__u16 minor_version;
		__u16 major_version;
	};
	__u32 version;
} __packed;


union dm_caps {
	struct {
		__u64 balloon:1;
		__u64 hot_add:1;
		/*
		 * To support guests that may have alignment
		 * limitations on hot-add, the guest can specify
		 * its alignment requirements; a value of n
		 * represents an alignment of 2^n in mega bytes.
		 */
		__u64 hot_add_alignment:4;
		__u64 reservedz:58;
	} cap_bits;
	__u64 caps;
} __packed;

union dm_mem_page_range {
	struct  {
		/*
		 * The PFN number of the first page in the range.
		 * 40 bits is the architectural limit of a PFN
		 * number for AMD64.
		 */
		__u64 start_page:40;
		/*
		 * The number of pages in the range.
		 */
		__u64 page_cnt:24;
	} finfo;
	__u64  page_range;
} __packed;



/*
 * The header for all dynamic memory messages:
 *
 * type: Type of the message.
 * size: Size of the message in bytes; including the header.
 * trans_id: The guest is responsible for manufacturing this ID.
 */

struct dm_header {
	__u16 type;
	__u16 size;
	__u32 trans_id;
} __packed;

/*
 * A generic message format for dynamic memory.
 * Specific message formats are defined later in the file.
 */

struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;


/*
 * Specific message types supporting the dynamic memory protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * dm_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;
	__u32 is_last_attempt:1;
	__u32 reservedz:31;
} __packed;

/*
 * Version response message; Host to Guest and indicates
 * if the host has accepted the version sent by the guest.
 *
 * is_accepted: If TRUE, host has accepted the version and the guest
 * should proceed to the next stage of the protocol. FALSE indicates that
 * guest should re-try with a different version.
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shutdown
 * the service.
 *
 * is_accepted: Indicates if the host has accepted guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 *		   in the system in pages.
 * zero_free: The nunber of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 *	    calculated as File Cache Page Fault Count - Page Read Count.
 *	    This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses num_committed value.
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;


/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * if TRUE there will atleast one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * if TRUE there will atleast one more message from the guest.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * mem_range: Memory range to hot add.
 *
 * On Linux we currently don't support this since we cannot hot add
 * arbitrary granularity of memory.
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation 1: success, 0: failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: no used.
 * info_size: size of the information blob.
 * info: information blob.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8  info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brough online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};

struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

/*
 * The last time we posted a pressure report to host.
 */
static unsigned long last_post_time;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");

module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);

static int dm_ring_size = (5 * PAGE_SIZE);

/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};


static __u8 recv_buffer[PAGE_SIZE];
static __u8 balloon_up_send_buffer[PAGE_SIZE];
#define PAGES_IN_2M	512
#define HA_CHUNK (32 * 1024)

struct hv_dynmem_device {
	struct hv_device *dev;
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;
	unsigned int num_pages_onlined;
	unsigned int num_pages_added;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
	 */
	struct completion  ol_waitevent;
	bool ha_waiting;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * Protects ha_region_list, num_pages_onlined counter and individual
	 * regions from ha_region_list.
	 */
	spinlock_t ha_lock;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;

	/*
	 * The negotiated version agreed by host.
	 */
	__u32 version;
};

static struct hv_dynmem_device dm_device;

static void post_status(struct hv_dynmem_device *dm);

#ifdef CONFIG_MEMORY_HOTPLUG
static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
				     unsigned long pfn)
{
	struct hv_hotadd_gap *gap;

	/* The page is not backed. */
	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
		return false;

	/* Check for gaps. */
	list_for_each_entry(gap, &has->gap_list, list) {
		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
			return false;
	}

	return true;
}

static unsigned long hv_page_offline_check(unsigned long start_pfn,
					   unsigned long nr_pages)
{
	unsigned long pfn = start_pfn, count = 0;
	struct hv_hotadd_state *has;
	bool found;

	while (pfn < start_pfn + nr_pages) {
		/*
		 * Search for HAS which covers the pfn and when we find one
		 * count how many consequitive PFNs are covered.
		 */
		found = false;
		list_for_each_entry(has, &dm_device.ha_region_list, list) {
			while ((pfn >= has->start_pfn) &&
			       (pfn < has->end_pfn) &&
			       (pfn < start_pfn + nr_pages)) {
				found = true;
				if (has_pfn_is_backed(has, pfn))
					count++;
				pfn++;
			}
		}

		/*
		 * This PFN is not in any HAS (e.g. we're offlining a region
		 * which was present at boot), no need to account for it. Go
		 * to the next one.
		 */
		if (!found)
			pfn++;
	}

	return count;
}

static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
			      void *v)
{
	struct memory_notify *mem = (struct memory_notify *)v;
	unsigned long flags, pfn_count;

	switch (val) {
	case MEM_ONLINE:
	case MEM_CANCEL_ONLINE:
		if (dm_device.ha_waiting) {
			dm_device.ha_waiting = false;
			complete(&dm_device.ol_waitevent);
		}
		break;

	case MEM_OFFLINE:
		spin_lock_irqsave(&dm_device.ha_lock, flags);
		pfn_count = hv_page_offline_check(mem->start_pfn,
						  mem->nr_pages);
		if (pfn_count <= dm_device.num_pages_onlined) {
			dm_device.num_pages_onlined -= pfn_count;
		} else {
			/*
			 * We're offlining more pages than we managed to online.
			 * This is unexpected. In any case don't let
			 * num_pages_onlined wrap around zero.
			 */
			WARN_ON_ONCE(1);
			dm_device.num_pages_onlined = 0;
		}
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
		break;
	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block hv_memory_nb = {
	.notifier_call = hv_memory_notifier,
	.priority = 0
};

/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
		if (!PageOffline(pg))
			__SetPageOffline(pg);
		return;
	}
	if (PageOffline(pg))
		__ClearPageOffline(pg);

	/* This frame is currently backed; online the page. */
	__online_page_set_limits(pg);
	__online_page_increment_counters(pg);
	__online_page_free(pg);

	lockdep_assert_held(&dm_device.ha_lock);
	dm_device.num_pages_onlined++;
}

static void hv_bring_pgs_online(struct hv_hotadd_state *has,
				unsigned long start_pfn, unsigned long size)
{
	int i;

	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
	for (i = 0; i < size; i++)
		hv_page_online_one(has, pfn_to_page(start_pfn + i));
}

static void hv_mem_hot_add(unsigned long start, unsigned long size,
				unsigned long pfn_count,
				struct hv_hotadd_state *has)
{
	int ret = 0;
	int i, nid;
	unsigned long start_pfn;
	unsigned long processed_pfn;
	unsigned long total_pfn = pfn_count;
	unsigned long flags;

	for (i = 0; i < (size/HA_CHUNK); i++) {
		start_pfn = start + (i * HA_CHUNK);

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		has->ha_end_pfn +=  HA_CHUNK;

		if (total_pfn > HA_CHUNK) {
			processed_pfn = HA_CHUNK;
			total_pfn -= HA_CHUNK;
		} else {
			processed_pfn = total_pfn;
			total_pfn = 0;
		}

		has->covered_end_pfn +=  processed_pfn;
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);

		init_completion(&dm_device.ol_waitevent);
		dm_device.ha_waiting = !memhp_auto_online;

		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
		ret = add_memory(nid, PFN_PHYS((start_pfn)),
				(HA_CHUNK << PAGE_SHIFT));

		if (ret) {
			pr_err("hot_add memory failed error is %d\n", ret);
			if (ret == -EEXIST) {
				/*
				 * This error indicates that the error
				 * is not a transient failure. This is the
				 * case where the guest's physical address map
				 * precludes hot adding memory. Stop all further
				 * memory hot-add.
				 */
				do_hot_add = false;
			}
			spin_lock_irqsave(&dm_device.ha_lock, flags);
			has->ha_end_pfn -= HA_CHUNK;
			has->covered_end_pfn -=  processed_pfn;
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			break;
		}

		/*
		 * Wait for the memory block to be onlined when memory onlining
		 * is done outside of kernel (memhp_auto_online). Since the hot
		 * add has succeeded, it is ok to proceed even if the pages in
		 * the hot added region have not been "onlined" within the
		 * allowed time.
		 */
		if (dm_device.ha_waiting)
			wait_for_completion_timeout(&dm_device.ol_waitevent,
						    5*HZ);
		post_status(&dm_device);
	}
}

static void hv_online_page(struct page *pg, unsigned int order)
{
	struct hv_hotadd_state *has;
	unsigned long flags;
	unsigned long pfn = page_to_pfn(pg);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/* The page belongs to a different HAS. */
		if ((pfn < has->start_pfn) ||
				(pfn + (1UL << order) > has->end_pfn))
			continue;

		hv_bring_pgs_online(has, pfn, 1UL << order);
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
}

static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
{
	struct hv_hotadd_state *has;
	struct hv_hotadd_gap *gap;
	unsigned long residual, new_inc;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		/*
		 * If the current start pfn is not where the covered_end
		 * is, create a gap and update covered_end_pfn.
		 */
		if (has->covered_end_pfn != start_pfn) {
			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
			if (!gap) {
				ret = -ENOMEM;
				break;
			}

			INIT_LIST_HEAD(&gap->list);
			gap->start_pfn = has->covered_end_pfn;
			gap->end_pfn = start_pfn;
			list_add_tail(&gap->list, &has->gap_list);

			has->covered_end_pfn = start_pfn;
		}

		/*
		 * If the current hot add-request extends beyond
		 * our current limit; extend it.
		 */
		if ((start_pfn + pfn_cnt) > has->end_pfn) {
			residual = (start_pfn + pfn_cnt - has->end_pfn);
			/*
			 * Extend the region by multiples of HA_CHUNK.
			 */
			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
			if (residual % HA_CHUNK)
				new_inc += HA_CHUNK;

			has->end_pfn += new_inc;
		}

		ret = 1;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return ret;
}

static unsigned long handle_pg_range(unsigned long pg_start,
					unsigned long pg_count)
{
	unsigned long start_pfn = pg_start;
	unsigned long pfn_cnt = pg_count;
	unsigned long size;
	struct hv_hotadd_state *has;
	unsigned long pgs_ol = 0;
	unsigned long old_covered_state;
	unsigned long res = 0, flags;

	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
		pg_start);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		old_covered_state = has->covered_end_pfn;

		if (start_pfn < has->ha_end_pfn) {
			/*
			 * This is the case where we are backing pages
			 * in an already hot added region. Bring
			 * these pages online first.
			 */
			pgs_ol = has->ha_end_pfn - start_pfn;
			if (pgs_ol > pfn_cnt)
				pgs_ol = pfn_cnt;

			has->covered_end_pfn +=  pgs_ol;
			pfn_cnt -= pgs_ol;
			/*
			 * Check if the corresponding memory block is already
			 * online. It is possible to observe struct pages still
			 * being uninitialized here so check section instead.
			 * In case the section is online we need to bring the
			 * rest of pfns (which were not backed previously)
			 * online too.
			 */
			if (start_pfn > has->start_pfn &&
			    online_section_nr(pfn_to_section_nr(start_pfn)))
				hv_bring_pgs_online(has, start_pfn, pgs_ol);

		}

		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
			/*
			 * We have some residual hot add range
			 * that needs to be hot added; hot add
			 * it now. Hot add a multiple of
			 * of HA_CHUNK that fully covers the pages
			 * we have.
			 */
			size = (has->end_pfn - has->ha_end_pfn);
			if (pfn_cnt <= size) {
				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
				if (pfn_cnt % HA_CHUNK)
					size += HA_CHUNK;
			} else {
				pfn_cnt = size;
			}
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
			spin_lock_irqsave(&dm_device.ha_lock, flags);
		}
		/*
		 * If we managed to online any pages that were given to us,
		 * we declare success.
		 */
		res = has->covered_end_pfn - old_covered_state;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return res;
}

static unsigned long process_hot_add(unsigned long pg_start,
					unsigned long pfn_cnt,
					unsigned long rg_start,
					unsigned long rg_size)
{
	struct hv_hotadd_state *ha_region = NULL;
	int covered;
	unsigned long flags;

	if (pfn_cnt == 0)
		return 0;

	if (!dm_device.host_specified_ha_region) {
		covered = pfn_covered(pg_start, pfn_cnt);
		if (covered < 0)
			return 0;

		if (covered)
			goto do_pg_range;
	}

	/*
	 * If the host has specified a hot-add range; deal with it first.
	 */

	if (rg_size != 0) {
		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
		if (!ha_region)
			return 0;

		INIT_LIST_HEAD(&ha_region->list);
		INIT_LIST_HEAD(&ha_region->gap_list);

		ha_region->start_pfn = rg_start;
		ha_region->ha_end_pfn = rg_start;
		ha_region->covered_start_pfn = pg_start;
		ha_region->covered_end_pfn = pg_start;
		ha_region->end_pfn = rg_start + rg_size;

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
	}

do_pg_range:
	/*
	 * Process the page range specified; bringing them
	 * online if possible.
	 */
	return handle_pg_range(pg_start, pfn_cnt);
}

#endif

static void hot_add_req(struct work_struct *dummy)
{
	struct dm_hot_add_response resp;
#ifdef CONFIG_MEMORY_HOTPLUG
	unsigned long pg_start, pfn_cnt;
	unsigned long rg_start, rg_sz;
#endif
	struct hv_dynmem_device *dm = &dm_device;

	memset(&resp, 0, sizeof(struct dm_hot_add_response));
	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
	resp.hdr.size = sizeof(struct dm_hot_add_response);

#ifdef CONFIG_MEMORY_HOTPLUG
	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;

	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;

	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
		unsigned long region_size;
		unsigned long region_start;

		/*
		 * The host has not specified the hot-add region.
		 * Based on the hot-add page range being specified,
		 * compute a hot-add region that can cover the pages
		 * that need to be hot-added while ensuring the alignment
		 * and size requirements of Linux as it relates to hot-add.
		 */
		region_start = pg_start;
		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
		if (pfn_cnt % HA_CHUNK)
			region_size += HA_CHUNK;

		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;

		rg_start = region_start;
		rg_sz = region_size;
	}

	if (do_hot_add)
		resp.page_count = process_hot_add(pg_start, pfn_cnt,
						rg_start, rg_sz);

	dm->num_pages_added += resp.page_count;
#endif
	/*
	 * The result field of the response structure has the
	 * following semantics:
	 *
	 * 1. If all or some pages hot-added: Guest should return success.
	 *
	 * 2. If no pages could be hot-added:
	 *
	 * If the guest returns success, then the host
	 * will not attempt any further hot-add operations. This
	 * signifies a permanent failure.
	 *
	 * If the guest returns failure, then this failure will be
	 * treated as a transient failure and the host may retry the
	 * hot-add operation after some delay.
	 */
	if (resp.page_count > 0)
		resp.result = 1;
	else if (!do_hot_add)
		resp.result = 1;
	else
		resp.result = 0;

	if (!do_hot_add || (resp.page_count == 0))
		pr_err("Memory hot add failed\n");

	dm->state = DM_INITIALIZED;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	vmbus_sendpacket(dm->dev->channel, &resp,
			sizeof(struct dm_hot_add_response),
			(unsigned long)NULL,
			VM_PKT_DATA_INBAND, 0);
}

static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
{
	struct dm_info_header *info_hdr;

	info_hdr = (struct dm_info_header *)msg->info;

	switch (info_hdr->type) {
	case INFO_TYPE_MAX_PAGE_CNT:
		if (info_hdr->data_size == sizeof(__u64)) {
			__u64 *max_page_count = (__u64 *)&info_hdr[1];

			pr_info("Max. dynamic memory size: %llu MB\n",
				(*max_page_count) >> (20 - PAGE_SHIFT));
		}

		break;
	default:
		pr_warn("Received Unknown type: %d\n", info_hdr->type);
	}
}

static unsigned long compute_balloon_floor(void)
{
	unsigned long min_pages;
	unsigned long nr_pages = totalram_pages();
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewiese linear function:
	 *  max MiB -> min MiB  gradient
	 *       0         0
	 *      16        16
	 *      32        24
	 *     128        72    (1/2)
	 *     512       168    (1/4)
	 *    2048       360    (1/8)
	 *    8192       744    (1/16)
	 *   32768      1512	(1/32)
	 */
	if (nr_pages < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (nr_pages >> 1);
	else if (nr_pages < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (nr_pages >> 2);
	else if (nr_pages < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (nr_pages >> 3);
	else if (nr_pages < MB2PAGES(8192))
		min_pages = MB2PAGES(232) + (nr_pages >> 4);
	else
		min_pages = MB2PAGES(488) + (nr_pages >> 5);
#undef MB2PAGES
	return min_pages;
}

/*
 * Post our status as it relates memory pressure to the
 * host. Host expects the guests to post this status
 * periodically at 1 second intervals.
 *
 * The metrics specified in this protocol are very Windows
 * specific and so we cook up numbers here to convey our memory
 * pressure.
 */

static void post_status(struct hv_dynmem_device *dm)
{
	struct dm_status status;
	unsigned long now = jiffies;
	unsigned long last_post = last_post_time;

	if (pressure_report_delay > 0) {
		--pressure_report_delay;
		return;
	}

	if (!time_after(now, (last_post_time + HZ)))
		return;

	memset(&status, 0, sizeof(struct dm_status));
	status.hdr.type = DM_STATUS_REPORT;
	status.hdr.size = sizeof(struct dm_status);
	status.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * The host expects the guest to report free and committed memory.
	 * Furthermore, the host expects the pressure information to include
	 * the ballooned out pages. For a given amount of memory that we are
	 * managing we need to compute a floor below which we should not
	 * balloon. Compute this and add it to the pressure report.
	 * We also need to report all offline pages (num_pages_added -
	 * num_pages_onlined) as committed to the host, otherwise it can try
	 * asking us to balloon them out.
	 */
	status.num_avail = si_mem_available();
	status.num_committed = vm_memory_committed() +
		dm->num_pages_ballooned +
		(dm->num_pages_added > dm->num_pages_onlined ?
		 dm->num_pages_added - dm->num_pages_onlined : 0) +
		compute_balloon_floor();

	trace_balloon_status(status.num_avail, status.num_committed,
			     vm_memory_committed(), dm->num_pages_ballooned,
			     dm->num_pages_added, dm->num_pages_onlined);
	/*
	 * If our transaction ID is no longer current, just don't
	 * send the status. This can happen if we were interrupted
	 * after we picked our transaction ID.
	 */
	if (status.hdr.trans_id != atomic_read(&trans_id))
		return;

	/*
	 * If the last post time that we sampled has changed,
	 * we have raced, don't post the status.
	 */
	if (last_post != last_post_time)
		return;

	last_post_time = jiffies;
	vmbus_sendpacket(dm->dev->channel, &status,
				sizeof(struct dm_status),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

}

static void free_balloon_pages(struct hv_dynmem_device *dm,
			 union dm_mem_page_range *range_array)
{
	int num_pages = range_array->finfo.page_cnt;
	__u64 start_frame = range_array->finfo.start_page;
	struct page *pg;
	int i;

	for (i = 0; i < num_pages; i++) {
		pg = pfn_to_page(i + start_frame);
		__ClearPageOffline(pg);
		__free_page(pg);
		dm->num_pages_ballooned--;
	}
}



static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
					unsigned int num_pages,
					struct dm_balloon_response *bl_resp,
					int alloc_unit)
{
	unsigned int i, j;
	struct page *pg;

	if (num_pages < alloc_unit)
		return 0;

	for (i = 0; (i * alloc_unit) < num_pages; i++) {
		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
			PAGE_SIZE)
			return i * alloc_unit;

		/*
		 * We execute this code in a thread context. Furthermore,
		 * we don't want the kernel to try too hard.
		 */
		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
				__GFP_NOMEMALLOC | __GFP_NOWARN,
				get_order(alloc_unit << PAGE_SHIFT));

		if (!pg)
			return i * alloc_unit;

		dm->num_pages_ballooned += alloc_unit;

		/*
		 * If we allocatted 2M pages; split them so we
		 * can free them in any order we get.
		 */

		if (alloc_unit != 1)
			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));

		/* mark all pages offline */
		for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
			__SetPageOffline(pg + j);

		bl_resp->range_count++;
		bl_resp->range_array[i].finfo.start_page =
			page_to_pfn(pg);
		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
		bl_resp->hdr.size += sizeof(union dm_mem_page_range);

	}

	return num_pages;
}

static void balloon_up(struct work_struct *dummy)
{
	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
	unsigned int num_ballooned = 0;
	struct dm_balloon_response *bl_resp;
	int alloc_unit;
	int ret;
	bool done = false;
	int i;
	long avail_pages;
	unsigned long floor;

	/* The host balloons pages in 2M granularity. */
	WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);

	/*
	 * We will attempt 2M allocations. However, if we fail to
	 * allocate 2M chunks, we will go back to 4k allocations.
	 */
	alloc_unit = 512;

	avail_pages = si_mem_available();
	floor = compute_balloon_floor();

	/* Refuse to balloon below the floor, keep the 2M granularity. */
	if (avail_pages < num_pages || avail_pages - num_pages < floor) {
		pr_warn("Balloon request will be partially fulfilled. %s\n",
			avail_pages < num_pages ? "Not enough memory." :
			"Balloon floor reached.");

		num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
		num_pages -= num_pages % PAGES_IN_2M;
	}

	while (!done) {
		memset(balloon_up_send_buffer, 0, PAGE_SIZE);
		bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
		bl_resp->more_pages = 1;

		num_pages -= num_ballooned;
		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
						    bl_resp, alloc_unit);

		if (alloc_unit != 1 && num_ballooned == 0) {
			alloc_unit = 1;
			continue;
		}

		if (num_ballooned == 0 || num_ballooned == num_pages) {
			pr_debug("Ballooned %u out of %u requested pages.\n",
				num_pages, dm_device.balloon_wrk.num_pages);

			bl_resp->more_pages = 0;
			done = true;
			dm_device.state = DM_INITIALIZED;
		}

		/*
		 * We are pushing a lot of data through the channel;
		 * deal with transient failures caused because of the
		 * lack of space in the ring buffer.
		 */

		do {
			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
			ret = vmbus_sendpacket(dm_device.dev->channel,
						bl_resp,
						bl_resp->hdr.size,
						(unsigned long)NULL,
						VM_PKT_DATA_INBAND, 0);

			if (ret == -EAGAIN)
				msleep(20);
			post_status(&dm_device);
		} while (ret == -EAGAIN);

		if (ret) {
			/*
			 * Free up the memory we allocatted.
			 */
			pr_err("Balloon response failed\n");

			for (i = 0; i < bl_resp->range_count; i++)
				free_balloon_pages(&dm_device,
						 &bl_resp->range_array[i]);

			done = true;
		}
	}

}

static void balloon_down(struct hv_dynmem_device *dm,
			struct dm_unballoon_request *req)
{
	union dm_mem_page_range *range_array = req->range_array;
	int range_count = req->range_count;
	struct dm_unballoon_response resp;
	int i;
	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;

	for (i = 0; i < range_count; i++) {
		free_balloon_pages(dm, &range_array[i]);
		complete(&dm_device.config_event);
	}

	pr_debug("Freed %u ballooned pages.\n",
		prev_pages_ballooned - dm->num_pages_ballooned);

	if (req->more_pages == 1)
		return;

	memset(&resp, 0, sizeof(struct dm_unballoon_response));
	resp.hdr.type = DM_UNBALLOON_RESPONSE;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	resp.hdr.size = sizeof(struct dm_unballoon_response);

	vmbus_sendpacket(dm_device.dev->channel, &resp,
				sizeof(struct dm_unballoon_response),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

	dm->state = DM_INITIALIZED;
}

static void balloon_onchannelcallback(void *context);

static int dm_thread_func(void *dm_dev)
{
	struct hv_dynmem_device *dm = dm_dev;

	while (!kthread_should_stop()) {
		wait_for_completion_interruptible_timeout(
						&dm_device.config_event, 1*HZ);
		/*
		 * The host expects us to post information on the memory
		 * pressure every second.
		 */
		reinit_completion(&dm_device.config_event);
		post_status(dm);
	}

	return 0;
}


static void version_resp(struct hv_dynmem_device *dm,
			struct dm_version_response *vresp)
{
	struct dm_version_request version_req;
	int ret;

	if (vresp->is_accepted) {
		/*
		 * We are done; wakeup the
		 * context waiting for version
		 * negotiation.
		 */
		complete(&dm->host_event);
		return;
	}
	/*
	 * If there are more versions to try, continue
	 * with negotiations; if not
	 * shutdown the service since we are not able
	 * to negotiate a suitable version number
	 * with the host.
	 */
	if (dm->next_version == 0)
		goto version_error;

	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = dm->next_version;
	dm->version = version_req.version.version;

	/*
	 * Set the next version to try in case current version fails.
	 * Win7 protocol ought to be the last one to try.
	 */
	switch (version_req.version.version) {
	case DYNMEM_PROTOCOL_VERSION_WIN8:
		dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
		version_req.is_last_attempt = 0;
		break;
	default:
		dm->next_version = 0;
		version_req.is_last_attempt = 1;
	}

	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
				sizeof(struct dm_version_request),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

	if (ret)
		goto version_error;

	return;

version_error:
	dm->state = DM_INIT_ERROR;
	complete(&dm->host_event);
}

static void cap_resp(struct hv_dynmem_device *dm,
			struct dm_capabilities_resp_msg *cap_resp)
{
	if (!cap_resp->is_accepted) {
		pr_err("Capabilities not accepted by host\n");
		dm->state = DM_INIT_ERROR;
	}
	complete(&dm->host_event);
}

static void balloon_onchannelcallback(void *context)
{
	struct hv_device *dev = context;
	u32 recvlen;
	u64 requestid;
	struct dm_message *dm_msg;
	struct dm_header *dm_hdr;
	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
	struct dm_balloon *bal_msg;
	struct dm_hot_add *ha_msg;
	union dm_mem_page_range *ha_pg_range;
	union dm_mem_page_range *ha_region;

	memset(recv_buffer, 0, sizeof(recv_buffer));
	vmbus_recvpacket(dev->channel, recv_buffer,
			 PAGE_SIZE, &recvlen, &requestid);

	if (recvlen > 0) {
		dm_msg = (struct dm_message *)recv_buffer;
		dm_hdr = &dm_msg->hdr;

		switch (dm_hdr->type) {
		case DM_VERSION_RESPONSE:
			version_resp(dm,
				 (struct dm_version_response *)dm_msg);
			break;

		case DM_CAPABILITIES_RESPONSE:
			cap_resp(dm,
				 (struct dm_capabilities_resp_msg *)dm_msg);
			break;

		case DM_BALLOON_REQUEST:
			if (dm->state == DM_BALLOON_UP)
				pr_warn("Currently ballooning\n");
			bal_msg = (struct dm_balloon *)recv_buffer;
			dm->state = DM_BALLOON_UP;
			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
			schedule_work(&dm_device.balloon_wrk.wrk);
			break;

		case DM_UNBALLOON_REQUEST:
			dm->state = DM_BALLOON_DOWN;
			balloon_down(dm,
				 (struct dm_unballoon_request *)recv_buffer);
			break;

		case DM_MEM_HOT_ADD_REQUEST:
			if (dm->state == DM_HOT_ADD)
				pr_warn("Currently hot-adding\n");
			dm->state = DM_HOT_ADD;
			ha_msg = (struct dm_hot_add *)recv_buffer;
			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
				/*
				 * This is a normal hot-add request specifying
				 * hot-add memory.
				 */
				dm->host_specified_ha_region = false;
				ha_pg_range = &ha_msg->range;
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range.page_range = 0;
			} else {
				/*
				 * Host is specifying that we first hot-add
				 * a region and then partially populate this
				 * region.
				 */
				dm->host_specified_ha_region = true;
				ha_pg_range = &ha_msg->range;
				ha_region = &ha_pg_range[1];
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range = *ha_region;
			}
			schedule_work(&dm_device.ha_wrk.wrk);
			break;

		case DM_INFO_MESSAGE:
			process_info(dm, (struct dm_info_msg *)dm_msg);
			break;

		default:
			pr_warn("Unhandled message: type: %d\n", dm_hdr->type);

		}
	}

}

static int balloon_connect_vsp(struct hv_device *dev)
{
	struct dm_version_request version_req;
	struct dm_capabilities cap_msg;
	unsigned long t;
	int ret;

	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
			 balloon_onchannelcallback, dev);
	if (ret)
		return ret;

	/*
	 * Initiate the hand shake with the host and negotiate
	 * a version that the host can support. We start with the
	 * highest version number and go down if the host cannot
	 * support it.
	 */
	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
	version_req.is_last_attempt = 0;
	dm_device.version = version_req.version.version;

	ret = vmbus_sendpacket(dev->channel, &version_req,
			       sizeof(struct dm_version_request),
			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto out;

	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * If we could not negotiate a compatible version with the host
	 * fail the probe function.
	 */
	if (dm_device.state == DM_INIT_ERROR) {
		ret = -EPROTO;
		goto out;
	}

	pr_info("Using Dynamic Memory protocol version %u.%u\n",
		DYNMEM_MAJOR_VERSION(dm_device.version),
		DYNMEM_MINOR_VERSION(dm_device.version));

	/*
	 * Now submit our capabilities to the host.
	 */
	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
	cap_msg.hdr.size = sizeof(struct dm_capabilities);
	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);

	cap_msg.caps.cap_bits.balloon = 1;
	cap_msg.caps.cap_bits.hot_add = 1;

	/*
	 * Specify our alignment requirements as it relates
	 * memory hot-add. Specify 128MB alignment.
	 */
	cap_msg.caps.cap_bits.hot_add_alignment = 7;

	/*
	 * Currently the host does not use these
	 * values and we set them to what is done in the
	 * Windows driver.
	 */
	cap_msg.min_page_cnt = 0;
	cap_msg.max_page_number = -1;

	ret = vmbus_sendpacket(dev->channel, &cap_msg,
			       sizeof(struct dm_capabilities),
			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto out;

	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * If the host does not like our capabilities,
	 * fail the probe function.
	 */
	if (dm_device.state == DM_INIT_ERROR) {
		ret = -EPROTO;
		goto out;
	}

	return 0;
out:
	vmbus_close(dev->channel);
	return ret;
}

static int balloon_probe(struct hv_device *dev,
			 const struct hv_vmbus_device_id *dev_id)
{
	int ret;

#ifdef CONFIG_MEMORY_HOTPLUG
	do_hot_add = hot_add;
#else
	do_hot_add = false;
#endif
	dm_device.dev = dev;
	dm_device.state = DM_INITIALIZING;
	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
	init_completion(&dm_device.host_event);
	init_completion(&dm_device.config_event);
	INIT_LIST_HEAD(&dm_device.ha_region_list);
	spin_lock_init(&dm_device.ha_lock);
	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
	dm_device.host_specified_ha_region = false;

#ifdef CONFIG_MEMORY_HOTPLUG
	set_online_page_callback(&hv_online_page);
	register_memory_notifier(&hv_memory_nb);
#endif

	hv_set_drvdata(dev, &dm_device);

	ret = balloon_connect_vsp(dev);
	if (ret != 0)
		return ret;

	dm_device.state = DM_INITIALIZED;

	dm_device.thread =
		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
	if (IS_ERR(dm_device.thread)) {
		ret = PTR_ERR(dm_device.thread);
		goto probe_error;
	}

	return 0;

probe_error:
	vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
	unregister_memory_notifier(&hv_memory_nb);
	restore_online_page_callback(&hv_online_page);
#endif
	return ret;
}

static int balloon_remove(struct hv_device *dev)
{
	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
	struct hv_hotadd_state *has, *tmp;
	struct hv_hotadd_gap *gap, *tmp_gap;
	unsigned long flags;

	if (dm->num_pages_ballooned != 0)
		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);

	cancel_work_sync(&dm->balloon_wrk.wrk);
	cancel_work_sync(&dm->ha_wrk.wrk);

	kthread_stop(dm->thread);
	vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
	unregister_memory_notifier(&hv_memory_nb);
	restore_online_page_callback(&hv_online_page);
#endif
	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
		list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
			list_del(&gap->list);
			kfree(gap);
		}
		list_del(&has->list);
		kfree(has);
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return 0;
}

static const struct hv_vmbus_device_id id_table[] = {
	/* Dynamic Memory Class ID */
	/* 525074DC-8985-46e2-8057-A307DC18A502 */
	{ HV_DM_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);

static  struct hv_driver balloon_drv = {
	.name = "hv_balloon",
	.id_table = id_table,
	.probe =  balloon_probe,
	.remove =  balloon_remove,
	.driver = {
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
};

static int __init init_balloon_drv(void)
{

	return vmbus_driver_register(&balloon_drv);
}

module_init(init_balloon_drv);

MODULE_DESCRIPTION("Hyper-V Balloon");
MODULE_LICENSE("GPL");