[{"data":1,"prerenderedAt":532},["ShallowReactive",2],{"i-kinnu:logo":3,"i-kinnu:origami-folding":8,"tile-science-statistics-for-data-science-advanced-level-features":12,"i-lucide:chevron-right":527,"i-lucide:menu":530},{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":7},0,27,false,"\u003Cg fill=\"none\">\u003Cpath d=\"M0.046875 1.05555C0.046875 1.03541 0.048197 1.01579 0.0507438 0.996728C0.0987149 0.438619 0.586845 0 1.18194 0H25.4398C26.451 0 26.9575 1.171 26.2424 1.85585L15.7301 11.9243L1.31574 0.903476C1.17475 0.79568 1.01137 0.761884 0.859586 0.784111L26.2936 25.1441C27.0086 25.829 26.5022 27 25.4909 27H1.18194C0.555061 27 0.046875 26.5133 0.046875 25.9129V1.05555Z\" fill=\"currentColor\"/>\u003C/g>",{"left":4,"top":4,"width":9,"height":10,"rotate":4,"vFlip":6,"hFlip":6,"body":11},1000,236,"\u003Cg fill=\"none\">\u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M193.68 38.2238C195.994 38.2238 197.87 40.0989 197.87 42.412V231.812C197.87 234.125 195.994 236 193.68 236H4.19013C1.87603 236 2.02305e-07 234.125 0 231.812V42.412C-2.02305e-07 40.0989 1.87603 38.2238 4.19013 38.2238H193.68ZM111.76 89.0072C111.685 87.9474 110.572 87.2905 109.608 87.7376L96.8872 93.641C95.7786 94.1554 95.702 95.7016 96.7545 96.3225L101.579 99.167C94.7045 109.365 90.5733 122.892 90.5732 137.642C90.5733 154.323 95.8569 169.439 104.416 179.945C105.301 181.032 106.9 181.196 107.987 180.311C109.075 179.426 109.238 177.828 108.353 176.741C100.621 167.25 95.6522 153.305 95.6521 137.642C95.6522 123.661 99.6138 111.051 105.963 101.754L110.456 104.403C111.508 105.024 112.826 104.21 112.74 102.991L111.76 89.0072ZM9.63194 136.286C9.14864 136.286 8.75684 136.678 8.75684 137.161C8.7569 137.644 9.14868 138.035 9.63194 138.035H17.2161C17.6993 138.035 18.0912 137.644 18.0912 137.161C18.0912 136.678 17.6994 136.286 17.2161 136.286H9.63194ZM22.6813 136.286C22.198 136.286 21.8062 136.678 21.8062 137.161C21.8063 137.644 22.1981 138.035 22.6813 138.035H30.2655C30.7487 138.035 31.1406 137.644 31.1406 137.161C31.1406 136.678 30.7488 136.286 30.2655 136.286H22.6813ZM35.7464 136.286C35.2631 136.286 34.8713 136.678 34.8713 137.161C34.8713 137.644 35.2631 138.035 35.7464 138.035H44.4973C44.9805 138.035 45.3724 137.644 45.3724 137.161C45.3724 136.678 44.9806 136.286 44.4973 136.286H35.7464ZM49.9977 136.286C49.5144 136.286 49.1226 136.678 49.1226 137.161C49.1226 137.644 49.5144 138.035 49.9977 138.035H57.5819C58.0651 138.035 58.4569 137.644 58.457 137.161C58.457 136.678 58.0651 136.286 57.5819 136.286H49.9977ZM63.0783 136.286C62.595 136.286 62.2032 136.678 62.2032 137.161C62.2033 137.644 62.5951 138.035 63.0783 138.035H70.6625C71.1457 138.035 71.5375 137.644 71.5376 137.161C71.5376 136.678 71.1457 136.286 70.6625 136.286H63.0783ZM76.1277 136.286C75.6444 136.286 75.2526 136.678 75.2526 137.161C75.2527 137.644 75.6445 138.035 76.1277 138.035H83.7119C84.1951 138.035 84.5869 137.644 84.587 137.161C84.587 136.678 84.1951 136.286 83.7119 136.286H76.1277ZM102.266 136.286C101.782 136.286 101.39 136.678 101.39 137.161C101.391 137.644 101.782 138.035 102.266 138.035H109.85C110.333 138.035 110.725 137.644 110.725 137.161C110.725 136.678 110.333 136.286 109.85 136.286H102.266ZM115.338 136.286C114.855 136.286 114.463 136.678 114.463 137.161C114.463 137.644 114.855 138.035 115.338 138.035H122.923C123.406 138.035 123.798 137.644 123.798 137.161C123.798 136.678 123.406 136.286 122.923 136.286H115.338ZM128.403 136.286C127.92 136.286 127.528 136.678 127.528 137.161C127.528 137.644 127.92 138.035 128.403 
138.035H135.988C136.471 138.035 136.863 137.644 136.863 137.161C136.863 136.678 136.471 136.286 135.988 136.286H128.403ZM141.468 136.286C140.985 136.286 140.593 136.678 140.593 137.161C140.593 137.644 140.985 138.035 141.468 138.035H149.053C149.536 138.035 149.928 137.644 149.928 137.161C149.928 136.678 149.536 136.286 149.053 136.286H141.468ZM154.541 136.286C154.058 136.286 153.666 136.678 153.666 137.161C153.666 137.644 154.058 138.035 154.541 138.035H162.125C162.609 138.035 163 137.644 163.001 137.161C163.001 136.678 162.609 136.286 162.125 136.286H154.541ZM167.614 136.286C167.131 136.286 166.739 136.678 166.739 137.161C166.739 137.644 167.131 138.035 167.614 138.035H175.198C175.681 138.035 176.073 137.644 176.073 137.161C176.073 136.678 175.681 136.286 175.198 136.286H167.614ZM180.671 136.286C180.188 136.286 179.796 136.678 179.796 137.161C179.796 137.644 180.188 138.035 180.671 138.035H188.255C188.739 138.035 189.13 137.644 189.131 137.161C189.131 136.678 188.739 136.286 188.255 136.286H180.671Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M444.85 38.2277C447.164 38.2277 449.04 40.1028 449.04 42.4159V132.928C449.04 135.241 447.164 137.116 444.85 137.116H255.36C253.046 137.116 251.17 135.241 251.17 132.928V42.4159C251.17 40.1028 253.046 38.2277 255.36 38.2277H444.85ZM361.96 125.388C361.618 125.046 361.064 125.046 360.722 125.388L354.534 131.572C354.192 131.914 354.192 132.468 354.534 132.81C354.876 133.151 355.43 133.151 355.772 132.81L361.96 126.624C362.301 126.283 362.301 125.73 361.96 125.388ZM371.047 116.311C370.705 115.969 370.15 115.969 369.809 116.311L364.446 121.671C364.104 122.012 364.104 122.567 364.446 122.908C364.788 123.249 365.342 123.25 365.684 122.908L371.047 117.548C371.388 117.207 371.388 116.652 371.047 116.311ZM380.124 107.246C379.782 106.904 379.227 106.904 378.885 107.246L373.523 112.606C373.181 112.948 373.181 113.502 373.523 113.844C373.864 114.185 374.419 114.185 374.761 113.844L380.124 108.483C380.465 108.142 380.465 107.587 380.124 107.246ZM385.736 65.8841C385.891 64.6727 384.622 63.7845 383.536 64.3434L371.069 70.7636C370.124 71.2504 369.96 72.5334 370.752 73.2424L381.2 82.5938C382.11 83.4081 383.561 82.8672 383.717 81.6557L384.393 76.3725C391.143 77.1933 398.567 80.7709 404.771 86.9711C411.124 93.3213 414.726 100.952 415.43 107.827C415.573 109.221 416.819 110.236 418.214 110.093C419.609 109.95 420.624 108.703 420.481 107.309C419.644 99.1317 415.435 90.4514 408.362 83.3817C401.466 76.489 393.038 72.3185 385.038 71.338L385.736 65.8841ZM389.2 98.1733C388.859 97.8319 388.304 97.8318 387.962 98.1733L382.6 103.534C382.258 103.875 382.258 104.429 382.6 104.771C382.941 105.112 383.496 105.112 383.838 104.771L389.2 99.4108C389.542 99.0693 389.542 98.5149 389.2 98.1733ZM398.262 89.1047C397.92 88.7633 397.365 88.7632 397.024 89.1047L391.661 94.4649C391.319 94.8065 391.319 95.3608 391.661 95.7024C392.002 96.0436 392.557 96.0438 392.899 95.7024L398.262 90.3421C398.603 90.0007 398.603 89.4463 398.262 89.1047ZM416.431 70.9616C416.089 70.6202 415.534 70.6201 415.193 70.9616L409.83 76.3218C409.488 76.6634 409.488 77.2177 409.83 77.5592C410.172 77.9005 410.726 77.9007 411.068 77.5592L416.431 72.199C416.772 71.8575 416.772 71.3032 416.431 70.9616ZM425.508 61.891C425.166 61.5496 424.611 61.5495 424.27 61.891L418.907 67.2512C418.565 67.5928 418.565 68.1471 418.907 68.4887C419.249 68.8299 419.803 68.8301 420.145 68.4887L425.508 63.1284C425.849 62.787 425.849 62.2326 425.508 61.891ZM434.569 52.8146C434.227 52.4731 433.673 
52.4731 433.331 52.8146L427.968 58.1748C427.626 58.5163 427.627 59.0706 427.968 59.4122C428.31 59.7534 428.864 59.7537 429.206 59.4122L434.569 54.052C434.91 53.7105 434.91 53.1562 434.569 52.8146ZM443.638 43.7479C443.296 43.4065 442.742 43.4064 442.4 43.7479L437.037 49.1081C436.695 49.4496 436.696 50.004 437.037 50.3455C437.379 50.6868 437.933 50.687 438.275 50.3455L443.638 44.9853C443.98 44.6438 443.979 44.0895 443.638 43.7479Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M684.066 38.2277C687.798 38.2281 689.667 42.7391 687.027 45.3773L596.473 135.889C595.687 136.675 594.621 137.116 593.51 137.116H506.335C504.021 137.116 502.145 135.241 502.145 132.928V42.4159C502.145 40.1028 504.021 38.2277 506.335 38.2277H684.066ZM514.603 124.566C514.261 124.224 513.707 124.224 513.365 124.566L507.178 130.751C506.836 131.093 506.836 131.646 507.178 131.988C507.519 132.329 508.073 132.329 508.415 131.988L514.603 125.803C514.945 125.462 514.945 124.908 514.603 124.566ZM523.689 115.491C523.348 115.15 522.794 115.15 522.452 115.491L517.09 120.852C516.748 121.193 516.748 121.747 517.09 122.088C517.431 122.43 517.985 122.43 518.327 122.088L523.689 116.728C524.031 116.386 524.031 115.833 523.689 115.491ZM532.102 65.8295C530.707 65.6872 529.46 66.7017 529.318 68.0957C529.175 69.4896 530.189 70.7355 531.584 70.8787C538.463 71.5825 546.096 75.1826 552.45 81.5329C558.723 87.8037 562.312 95.3226 563.079 102.13L557.738 102.392C556.518 102.452 555.865 103.855 556.607 104.827L565.115 115.969C565.76 116.814 567.051 116.751 567.611 115.847L574.992 103.928C575.635 102.889 574.848 101.555 573.628 101.615L568.161 101.882C568.161 101.878 568.162 101.874 568.161 101.871C567.324 93.6931 563.114 85.0124 556.041 77.9425C548.968 70.873 540.283 66.6668 532.102 65.8295ZM532.766 106.421C532.425 106.079 531.871 106.079 531.529 106.421L526.166 111.781C525.825 112.123 525.825 112.676 526.166 113.018C526.508 113.359 527.062 113.359 527.403 113.018L532.766 107.657C533.108 107.316 533.108 106.762 532.766 106.421ZM541.843 97.3445C541.501 97.003 540.948 97.003 540.606 97.3445L535.243 102.705C534.901 103.046 534.902 103.6 535.243 103.941C535.585 104.283 536.139 104.283 536.48 103.941L541.843 98.5809C542.185 98.2393 542.185 97.686 541.843 97.3445ZM550.92 88.2778C550.578 87.9363 550.025 87.9363 549.683 88.2778L544.32 93.638C543.978 93.9796 543.978 94.5329 544.32 94.8745C544.662 95.2161 545.215 95.2161 545.557 94.8745L550.92 89.5142C551.262 89.1727 551.262 88.6193 550.92 88.2778ZM569.066 70.1405C568.724 69.799 568.17 69.7991 567.829 70.1405L562.466 75.5008C562.124 75.8423 562.124 76.3956 562.466 76.7372C562.808 77.0788 563.361 77.0788 563.703 76.7372L569.066 71.377C569.407 71.0354 569.407 70.4821 569.066 70.1405ZM578.143 61.0699C577.801 60.7284 577.247 60.7285 576.906 61.0699L571.543 66.4302C571.201 66.7717 571.201 67.3251 571.543 67.6666C571.885 68.0082 572.438 68.0082 572.78 67.6666L578.143 62.3064C578.484 61.9648 578.484 61.4115 578.143 61.0699ZM587.219 51.9896C586.878 51.6481 586.324 51.6481 585.982 51.9896L580.62 57.3498C580.278 57.6914 580.278 58.2447 580.62 58.5863C580.961 58.9279 581.515 58.9279 581.857 58.5863L587.219 53.2261C587.561 52.8845 587.561 52.3312 587.219 51.9896ZM596.288 42.9249C595.947 42.5833 595.392 42.5833 595.05 42.9249L589.689 48.2851C589.347 48.6267 589.347 49.18 589.689 49.5216C590.03 49.863 590.584 49.8631 590.926 49.5216L596.288 44.1613C596.63 43.8198 596.63 43.2664 596.288 42.9249Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" 
clip-rule=\"evenodd\"\n    d=\"M850.814 38.2277C854.547 38.2281 856.416 42.739 853.777 45.3773L763.223 135.889C762.437 136.674 761.371 137.116 760.26 137.116H673.176C669.443 137.116 667.574 132.605 670.213 129.966L760.768 39.4544C761.554 38.6692 762.62 38.2277 763.731 38.2277H850.814ZM761.338 121.8C760.855 121.8 760.463 122.191 760.463 122.674V131.13H762.213V122.674C762.213 122.191 761.821 121.8 761.338 121.8ZM761.338 108.971C760.855 108.971 760.463 109.363 760.463 109.846V118.301H762.213V109.846C762.213 109.363 761.821 108.971 761.338 108.971ZM761.338 96.1402C760.855 96.1406 760.463 96.5321 760.463 97.0149V105.47H762.213V97.0149C762.213 96.532 761.821 96.1404 761.338 96.1402ZM782.263 71.887C781.043 71.951 780.395 73.3571 781.139 74.3257L784.474 78.6631C779.115 82.951 771.242 85.7443 762.35 85.7444C753.366 85.7442 745.421 82.8944 740.059 78.5305C738.972 77.6461 737.373 77.8099 736.488 78.8961C735.602 79.983 735.766 81.582 736.853 82.467C743.231 87.6574 752.348 90.8207 762.35 90.8209C772.209 90.8208 781.205 87.746 787.568 82.6884L790.833 86.9341C791.577 87.9025 793.103 87.6391 793.479 86.4767L797.791 73.138C798.118 72.127 797.33 71.1017 796.268 71.1566L782.263 71.887ZM761.338 70.4847C760.855 70.4851 760.463 70.8767 760.463 71.3594V79.8147H762.213V71.3594C762.213 70.8766 761.821 70.485 761.338 70.4847ZM761.338 57.656C760.855 57.6564 760.463 58.048 760.463 58.5307V66.986H762.213V58.5307C762.213 58.0479 761.821 57.6563 761.338 57.656ZM761.338 44.8293C760.855 44.8297 760.463 45.2212 760.463 45.704V54.1592H762.213V45.704C762.213 45.2211 761.821 44.8295 761.338 44.8293Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M995.759 38.2277C999.53 38.228 1001.42 42.5171 998.752 45.0253L959.55 81.9005L905.796 41.5363C905.271 41.1418 904.662 41.0182 904.096 41.0994L997.485 130.319C1000.15 132.828 998.262 137.116 994.491 137.116H905.298C902.96 137.116 901.065 135.333 901.065 133.134V42.0941C901.065 42.0204 901.07 41.9483 901.079 41.8786C901.258 39.8345 903.079 38.2277 905.298 38.2277H995.759Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M505.873 0C506.657 4.57042e-05 507.307 0.195499 507.823 0.587023C508.338 0.969046 508.596 1.53802 508.596 2.29251C508.596 2.76034 508.467 3.19015 508.209 3.58162C507.951 3.96344 507.497 4.26401 506.848 4.48361V4.54114C507.65 4.67487 508.205 4.96191 508.51 5.4012C508.816 5.83087 508.969 6.31772 508.969 6.86193C508.969 7.74056 508.672 8.41851 508.08 8.89604C507.497 9.38304 506.733 9.62731 505.787 9.62738C504.861 9.62738 504.158 9.42172 503.68 9.0111C503.212 8.60054 502.935 8.08005 502.849 7.44993L503.881 7.10571L503.924 7.24028C504.035 7.54934 504.211 7.82925 504.454 8.07986C504.731 8.36635 505.166 8.50986 505.758 8.50989C506.465 8.50989 506.943 8.32772 507.191 7.9648C507.449 7.6019 507.579 7.20078 507.579 6.7615C507.579 6.2173 507.378 5.80683 506.977 5.52992C506.585 5.25295 505.93 5.10026 505.013 5.07161V4.15402C505.901 4.12537 506.489 3.92484 506.776 3.55237C507.062 3.18009 507.206 2.82242 507.206 2.47876C507.206 1.62801 506.752 1.17539 505.845 1.12237L505.658 1.11749C505.467 1.11752 505.242 1.14605 504.985 1.2033C504.736 1.25105 504.511 1.3274 504.31 1.43245L504.081 2.56457L503.05 2.44951L503.322 0.687461C503.666 0.49653 504.068 0.33454 504.526 0.200875C504.985 0.0671945 505.434 0 505.873 0Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M905.727 2.30616L904.638 2.4066L904.466 1.26083H901.428V3.72497C901.533 3.71544 901.643 3.71034 901.757 3.71034H902.086C902.755 3.71034 903.386 3.78668 903.979 3.93949C904.58 4.09229 905.068 4.38363 905.44 
4.8132C905.822 5.23335 906.014 5.84949 906.014 6.66106C906.014 7.64468 905.722 8.38068 905.14 8.86776C904.557 9.36434 903.783 9.6127 902.818 9.61275C901.91 9.61275 901.213 9.40711 900.725 8.99648C900.248 8.59544 899.96 8.08007 899.865 7.44993L900.911 7.10571C901.007 7.49723 901.203 7.8271 901.499 8.09449C901.795 8.37131 902.211 8.50985 902.746 8.50989C903.395 8.50989 903.869 8.33787 904.165 7.99405C904.461 7.65981 904.609 7.22507 904.609 6.69031C904.609 5.87861 904.337 5.3625 903.792 5.14279C903.248 4.91361 902.612 4.79958 901.886 4.79955C901.695 4.79955 901.489 4.80365 901.27 4.8132C901.059 4.82275 900.854 4.83701 900.653 4.85611L900.224 4.44071V0.143343H905.569L905.727 2.30616Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M765.49 6.04576H766.966L766.837 7.14862H765.49V9.48404H764.185V7.14862H759.857L759.713 6.04576L762.909 0.143343H765.49V6.04576ZM760.96 6.04576H764.185V1.26083H763.541L760.96 6.04576Z\"\n    fill=\"currentColor\" />\n  \u003Cpath d=\"M4.80573 6.47481H6.41154V7.60693H1.81068V6.47481H3.50235V1.27546H1.81068V0.143343H4.80573V6.47481Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M254.359 0C255.353 0 256.055 0.239186 256.466 0.716715C256.877 1.18447 257.083 1.68072 257.083 2.20573C257.083 2.85516 256.849 3.44346 256.38 3.96875C255.912 4.49397 255.348 4.96638 254.689 5.38657C254.039 5.79717 253.437 6.15968 252.883 6.47481H256.423L256.538 5.42948L257.599 5.51529L257.426 7.60693H251.407L251.292 6.58987C252.582 5.73032 253.638 4.98523 254.46 4.35489C255.281 3.71509 255.693 3.05632 255.693 2.37832C255.693 1.53787 255.166 1.11749 254.115 1.12237L254.115 1.11749C253.924 1.11754 253.695 1.14604 253.427 1.2033C253.16 1.25104 252.916 1.32238 252.697 1.41783L252.467 2.47876L251.45 2.3637L251.707 0.60165C252.118 0.401088 252.563 0.253475 253.041 0.15797C253.519 0.0529708 253.958 1.99446e-05 254.359 0Z\"\n    fill=\"currentColor\" />\u003C/g>",{"tile":13,"orbsWithOnlyMarkdownPages":201},{"id":14,"data":15,"type":16,"maxContentLevel":19,"version":20,"orbs":21},"e797a672-deab-425b-8038-85c709acd6d7",{"type":16,"title":17,"tagline":18},9,"Features ","Learn how to manipulate and transform variables for statistics and data science",3,1,[22,80],{"id":23,"data":24,"type":25,"version":20,"maxContentLevel":19,"pages":27},"cd7dcd62-5a31-499e-8b55-bd5012e48060",{"type":25,"title":26},2,"Feature Engineering Techniques",[28,47,65],{"id":29,"data":30,"type":20,"maxContentLevel":19,"version":20,"reviews":34},"0578cec0-0fdc-48c6-86d9-e8359e2d19eb",{"type":20,"title":31,"contentRole":25,"markdownContent":32,"audioMediaId":33},"Feature engineering ","\n\nFeature engineering is the process of finding, creating, and selecting the best data for your model or analysis. This is helpful for statistics and machine learning because using only your raw data might not be optimal for your model’s performance. \n\nJust because you have ‘big data’, that doesn’t mean you have to use it all. It would be kind of like if you were baking: you don’t want to grab all the ingredients in your kitchen and add them to the cake just because you have them there. Also, you might be able to get better results by engineering your features, which is changing or altering them in some way.\n\n ![Graph](image://4abdefeb-abf4-4d7b-966b-b3ac51c1cc21 \" \")\n\nThe ingredients you use will determine whether you end up with some delicious results, or a culinary disaster. So, put your chef hat on – let’s get started! 
","64b07283-2520-4367-8f6e-c4d966e7909d",[35],{"id":36,"data":37,"type":38,"version":20,"maxContentLevel":19},"3fe45dea-4352-4293-9abb-d09a7d31fa18",{"type":38,"reviewType":19,"spacingBehaviour":20,"multiChoiceQuestion":39,"multiChoiceCorrect":41,"multiChoiceIncorrect":43},11,[40],"What is the process of finding, creating, and selecting the best data for your model or analysis called?",[42],"Feature engineering",[44,45,46],"Feature selection","Feature analysis","Feature optimization",{"id":48,"data":49,"type":20,"maxContentLevel":19,"version":20,"reviews":53},"e657d70d-0b4c-4421-919f-b4b7c681239f",{"type":20,"title":50,"contentRole":25,"markdownContent":51,"audioMediaId":52},"Ordinal encoding ","\nOrdinal encoding is not something you will have to do every time you run a statistical analysis or create a machine learning model. But, it is helpful to know, because many machine learning models require all inputs to be numeric. Ordinal encoding turns a categorical variable into a numerical one. \n\nWait, what? How is that even possible? Well, it’s a lot simpler than it might sound. For every option you have for your categorical variable, let’s say {‘High School’, ‘College’, ‘Bachelor’s Degree’, ‘Master’s Degree’, ‘PhD’} indicating your observation’s level of education, you create a column named after High School, ‘College’, ‘Bachelor’s Degree’ etc. So where your data previously looked like the table below:\n\n ![Graph](image://2fd065c5-989d-45df-8e69-74da4d945593 \" \")\n\nAfter assigning each category option a number, it looks like this:\n\n ![Graph](image://0f6118ec-f2a9-45eb-a518-2c170941be15 \" \")\n\nOrdinal encoding is simple, and easy to reverse. But, if your data is not ordinal in the first place, it will apply to an ordinal relationship where one does not exist. For example if your variable was instead car color or transport type. In this case, ‘one hot encoding’ might be a more suitable encoding option.\n\n","4316efa2-ce06-4a7c-a723-83ad9311064e",[54],{"id":55,"data":56,"type":38,"version":20,"maxContentLevel":19},"5134b0a0-422a-468c-adfd-5b81714c8548",{"type":38,"reviewType":19,"spacingBehaviour":20,"multiChoiceQuestion":57,"multiChoiceCorrect":59,"multiChoiceIncorrect":61},[58],"What encoding option should be used when the categorical variable is not ordinal?",[60],"One hot encoding",[62,63,64],"Binary encoding","Label encoding","Ordinal encoding",{"id":66,"data":67,"type":20,"maxContentLevel":19,"version":20,"reviews":71},"09771c7d-3792-4a8b-aae8-b2f87901c76c",{"type":20,"title":68,"contentRole":25,"markdownContent":69,"audioMediaId":70},"One hot encoding ","\nA common method used for feature encoding is called ‘one-hot encoding’. It is useful when your data is not ordinal, and you don’t want to use ordinal encoding and introduce ordinality where there is none. Basically, you take your categorical data that looks like this:\n\n ![Graph](image://0a149809-e2c2-49d6-accf-7c93dffd14b1 \" \")\n\nYou then turn it into something like the data in the table below, where 1 equals \"true” and 0 to “false”. Each person now has a numerical value for true or false depending on whether they selected that option as their preferred mode of transport, or not. 
\n\n\n ![Graph](image://cddfb13c-e783-4372-a501-ed4bf93adeeb \" \")\n\nThis is particularly useful because computers are designed to interpret binary data.\n\n","f3d00555-d274-42f5-9c3e-960bad50240f",[72],{"id":73,"data":74,"type":38,"version":20,"maxContentLevel":19},"a6f455af-418d-4662-bf85-83f2878b8b75",{"type":38,"reviewType":75,"spacingBehaviour":20,"clozeQuestion":76,"clozeWords":78},4,[77],"One-hot encoding is a method used to turn categorical data into binary data.",[79],"binary",{"id":81,"data":82,"type":25,"version":20,"maxContentLevel":19,"pages":84},"1285ca22-b298-4f3e-a32f-dd1f8d3507b1",{"type":25,"title":83},"Feature Scaling Methods",[85,101,119,137,151,166,183],{"id":86,"data":87,"type":20,"maxContentLevel":19,"version":20,"reviews":91},"3f01f61e-73ea-49a2-ab54-46ce80256bad",{"type":20,"title":88,"contentRole":25,"markdownContent":89,"audioMediaId":90},"Feature Scaling ","\nFeature scaling is the process of altering your data in some way through normalization or standardization to achieve uniformity in the shape of your distribution. For example, it allows you to prescribe the minimum values, the maximum values, and the variance. It makes your distribution the same shape. \n\nFeature scaling is useful because whenever distances are used for calculations and conclusions within a machine learning algorithm, there is the possibility that one variable can dominate another due to its sheer scale, rather than importance. Some algorithms that benefit from feature scaling because they use euclidean distance as a measurement for comparison. The Euclidean distance is just the length of a straight line drawn between two points.\n\nFor example, age and salary are measured on very different scales. Age can reach just over 100, while for salary you could have multiple millions. The range of distances possible for one variable are much greater than they are for the other. \n\n ![Graph](image://02929be3-5537-45ba-af07-0e5538132874 \" \")\n\nSo, we feature scale to give every variable a fair chance at influencing results – to show us what really is most important, and were the relationships are. This stops the biggest bully in the dataset having all the say. ","d05057f2-f9ca-4e40-b86e-4733c78b3117",[92],{"id":93,"data":94,"type":38,"version":20,"maxContentLevel":19},"9fa0c8a0-80cb-4e01-87b4-b2f4567a1ef6",{"type":38,"reviewType":25,"spacingBehaviour":20,"binaryQuestion":95,"binaryCorrect":97,"binaryIncorrect":99},[96],"What is the purpose of feature scaling?",[98],"To give every variable a fair chance at influencing results",[100],"To create uniformity in the shape of the distribution",{"id":102,"data":103,"type":20,"maxContentLevel":19,"version":20,"reviews":107},"593132b5-4ad5-42c8-ae17-5830ec87242d",{"type":20,"title":104,"contentRole":25,"markdownContent":105,"audioMediaId":106},"Normalization ","\nNormalizing the values in your distribution rescales them so that they are all between 0 and 1. While previously you might have had income data that ranged from $10,000 to $2,5,000,000, it creates an easier scale. However, by doing this, you will lose the outliers in your dataset. \n\nNormalization is otherwise known as min-max scaling, and by looking at the equation below, you will see why. 
## Feature Scaling Methods

### Feature scaling

Feature scaling is the process of altering your data through normalization or standardization to achieve uniformity in the shape of your distribution. For example, it allows you to prescribe the minimum value, the maximum value, and the variance, putting every variable on a comparable scale.

Feature scaling is useful because whenever distances are used for calculations and conclusions within a machine learning algorithm, one variable can dominate another due to its sheer scale rather than its importance. Many algorithms benefit from feature scaling because they use Euclidean distance as a measure for comparison; the Euclidean distance is just the length of a straight line drawn between two points.

For example, age and salary are measured on very different scales. Age can reach just over 100, while salary can reach multiple millions. The range of distances possible for one variable is much greater than for the other.

 ![Graph](image://02929be3-5537-45ba-af07-0e5538132874 " ")

So, we feature scale to give every variable a fair chance at influencing results – to show us what really is most important, and where the relationships are. This stops the biggest bully in the dataset from having all the say.

### Normalization

Normalizing the values in your distribution rescales them so that they are all between 0 and 1. Where previously you might have had income data ranging from $10,000 to $2,500,000, normalization puts it on a much easier scale. However, by doing this, you will lose the outliers in your dataset.

Normalization is otherwise known as min-max scaling, and by looking at the equation below, you will see why: you use the minimum and maximum values of your variable of interest to normalize each datapoint.

 ![Graph](image://eee48109-2b10-49ca-8bed-f980b5b338d5 "The normalization or min-max scaling equation")
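The equation in the image maps each value x to (x − min) / (max − min). A minimal sketch of applying it to made-up income values; scikit-learn’s `MinMaxScaler` implements the same formula:

```python
import pandas as pd

# Made-up income values on a wide scale.
income = pd.Series([10_000, 55_000, 120_000, 2_500_000])

# Min-max scaling: x' = (x - min) / (max - min), landing every value in [0, 1].
normalized = (income - income.min()) / (income.max() - income.min())
print(normalized)

# The same transformation via scikit-learn:
# from sklearn.preprocessing import MinMaxScaler
# normalized = MinMaxScaler().fit_transform(income.to_frame())
```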
### Pros and cons of normalization

You should use normalization when your data is not normally distributed and your model does not make assumptions about the distribution of your data.

There are some cons to normalization – for example, you will lose your outliers, which may have been important for an understanding of your data. You also lose your original values: they are still there, but on a different scale, so you can’t really interpret the new values in terms of the original measurement units, like feet or liters.

 ![Graph](image://c1da599e-bd1d-4b48-8ad0-9c90d9f4ffd8 " ")

### Standardization

Standardizing rescales your data to conform to the standard normal distribution: it will have a mean of 0 and a standard deviation of 1. It’s useful when the model you intend to use requires that your data be normally distributed and on similar scales. We do this by subtracting the sample mean from each datapoint, and dividing that by the standard deviation.

 ![Graph](image://67b82788-e728-44d1-bad1-d09e61ef4d10 " ")
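A minimal sketch of the same calculation on made-up height data:

```python
import pandas as pd

# Made-up height data in centimeters.
heights = pd.Series([150.0, 160.0, 170.0, 180.0, 190.0])

# z = (x - mean) / standard deviation
z_scores = (heights - heights.mean()) / heights.std()

print(z_scores.mean())  # ~0.0
print(z_scores.std())   # 1.0

# scikit-learn's StandardScaler does the same, though it divides by the
# population standard deviation (ddof=0) rather than pandas' sample default:
# from sklearn.preprocessing import StandardScaler
# z = StandardScaler().fit_transform(heights.to_frame())
```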
### Pros and cons of standardization

You should use standardization, as opposed to normalization, when your data is normally distributed or has outliers – with normalization, you would lose those outliers.

There are some cons to standardization – for example, you lose your original values. They are still there, but on a different scale, so you can’t really interpret the new values in terms of the original measurement units, like centimeters or dollars.

Standardization should also be used if you plan to run statistical tests like the Analysis of Variance (ANOVA), or to use models like regularized linear and logistic regression, which assume that your residuals – the distances between your line of best fit and your values – are normally distributed.

 ![Graph](image://0ceb44e9-8a41-4b38-843b-2f05919c498a " ")

### Which models / algorithms need feature scaling

Many models work by computing distances between data points – if the scales of the variables differ, the results obtained from these models won’t be accurate. This is why we scale our data via normalization or standardization to create uniformity between variables.

Some examples of models that rely on computing distance: K Nearest Neighbors (KNN), a supervised machine learning algorithm that classifies new data based on its distance to existing data points whose category we already know; Support Vector Machines (SVM), also supervised algorithms, which use distance to separate, group, and classify data points; and K-means clustering, an unsupervised machine learning algorithm, meaning you don’t need labelled data – it finds patterns in the data for you based on distances.

Other examples of algorithms sensitive to variables with different ranges include dimension reduction algorithms such as Principal Component Analysis.

 ![Graph](image://2f1b9978-bae9-4b46-a2b6-9180090eba91 " ")
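One common way to make sure a distance-based model only ever sees scaled inputs is to bundle the scaler and the model together. A minimal scikit-learn sketch on made-up age/salary data:

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Made-up features on very different scales: age (years) and salary ($).
X = np.array([
    [25,    30_000],
    [47,   250_000],
    [35,    40_000],
    [52, 1_200_000],
])
y = np.array([0, 1, 0, 1])

# Without scaling, salary would dominate every Euclidean distance KNN
# computes; the pipeline standardizes both features before fitting.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
model.fit(X, y)

print(model.predict([[30, 45_000]]))
```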
### Dummy encoding

Dummy encoding is used for regression models – models that predict one value from another – when one of those values is a categorical variable. For example, you could predict your exam score, which is a continuous variable, from your favorite Kinnu tile, which is a categorical variable. Without dummy encoding, the model’s correlation coefficient cannot be calculated.

Because a regression analysis requires numerical values as inputs, we need to transform our categorical variable into integers, and dummy encoding enables us to do that. For example, let’s say our data looks like this:

 ![Graph](image://ae504abe-393a-4ed4-94c6-ded3ddc928e9 " ")

Once dummy encoding has been performed on the data above, it is represented numerically like so:

 ![Graph](image://82c8d8db-3198-4200-b8c1-aae7b6f20cb6 " ")

But you might notice that one of our options is missing. Where did ‘Private Jet’ go? No, it didn’t take off to the Maldives! It went missing because if all fields equal 0 for the other three columns, we know that the value for ‘Private Jet’ must be ‘true’ – 1. This is the case with Elon in our data above. Dropping that one redundant column is the step that makes the calculation of correlation coefficients in regression models possible.
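pandas can perform this ‘leave one category out’ step directly via `get_dummies(..., drop_first=True)`. A minimal sketch on hypothetical data like the tables above (note that pandas drops the alphabetically first category, not ‘Private Jet’ specifically; which one is dropped doesn’t matter):

```python
import pandas as pd

# Hypothetical data mirroring the tables above.
df = pd.DataFrame({
    "transport":  ["Car", "Bike", "Private Jet", "Train"],
    "exam_score": [72, 85, 91, 78],
})

# drop_first=True leaves one category out: if every remaining column is 0,
# the observation must belong to the dropped category.
dummies = pd.get_dummies(df["transport"], drop_first=True, dtype=int)
data = pd.concat([df[["exam_score"]], dummies], axis=1)

print(data)
```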