[{"data":1,"prerenderedAt":1245},["ShallowReactive",2],{"i-kinnu:logo":3,"i-kinnu:origami-folding":8,"pathway-science-statistics-for-data-science-advanced-level":12,"i-lucide:chevron-right":1240,"i-lucide:tag":1243},{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":7},0,27,false,"\u003Cg fill=\"none\">\u003Cpath d=\"M0.046875 1.05555C0.046875 1.03541 0.048197 1.01579 0.0507438 0.996728C0.0987149 0.438619 0.586845 0 1.18194 0H25.4398C26.451 0 26.9575 1.171 26.2424 1.85585L15.7301 11.9243L1.31574 0.903476C1.17475 0.79568 1.01137 0.761884 0.859586 0.784111L26.2936 25.1441C27.0086 25.829 26.5022 27 25.4909 27H1.18194C0.555061 27 0.046875 26.5133 0.046875 25.9129V1.05555Z\" fill=\"currentColor\"/>\u003C/g>",{"left":4,"top":4,"width":9,"height":10,"rotate":4,"vFlip":6,"hFlip":6,"body":11},1000,236,"\u003Cg fill=\"none\">\u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M193.68 38.2238C195.994 38.2238 197.87 40.0989 197.87 42.412V231.812C197.87 234.125 195.994 236 193.68 236H4.19013C1.87603 236 2.02305e-07 234.125 0 231.812V42.412C-2.02305e-07 40.0989 1.87603 38.2238 4.19013 38.2238H193.68ZM111.76 89.0072C111.685 87.9474 110.572 87.2905 109.608 87.7376L96.8872 93.641C95.7786 94.1554 95.702 95.7016 96.7545 96.3225L101.579 99.167C94.7045 109.365 90.5733 122.892 90.5732 137.642C90.5733 154.323 95.8569 169.439 104.416 179.945C105.301 181.032 106.9 181.196 107.987 180.311C109.075 179.426 109.238 177.828 108.353 176.741C100.621 167.25 95.6522 153.305 95.6521 137.642C95.6522 123.661 99.6138 111.051 105.963 101.754L110.456 104.403C111.508 105.024 112.826 104.21 112.74 102.991L111.76 89.0072ZM9.63194 136.286C9.14864 136.286 8.75684 136.678 8.75684 137.161C8.7569 137.644 9.14868 138.035 9.63194 138.035H17.2161C17.6993 138.035 18.0912 137.644 18.0912 137.161C18.0912 136.678 17.6994 136.286 17.2161 136.286H9.63194ZM22.6813 136.286C22.198 136.286 21.8062 136.678 21.8062 137.161C21.8063 137.644 22.1981 138.035 22.6813 138.035H30.2655C30.7487 
138.035 31.1406 137.644 31.1406 137.161C31.1406 136.678 30.7488 136.286 30.2655 136.286H22.6813ZM35.7464 136.286C35.2631 136.286 34.8713 136.678 34.8713 137.161C34.8713 137.644 35.2631 138.035 35.7464 138.035H44.4973C44.9805 138.035 45.3724 137.644 45.3724 137.161C45.3724 136.678 44.9806 136.286 44.4973 136.286H35.7464ZM49.9977 136.286C49.5144 136.286 49.1226 136.678 49.1226 137.161C49.1226 137.644 49.5144 138.035 49.9977 138.035H57.5819C58.0651 138.035 58.4569 137.644 58.457 137.161C58.457 136.678 58.0651 136.286 57.5819 136.286H49.9977ZM63.0783 136.286C62.595 136.286 62.2032 136.678 62.2032 137.161C62.2033 137.644 62.5951 138.035 63.0783 138.035H70.6625C71.1457 138.035 71.5375 137.644 71.5376 137.161C71.5376 136.678 71.1457 136.286 70.6625 136.286H63.0783ZM76.1277 136.286C75.6444 136.286 75.2526 136.678 75.2526 137.161C75.2527 137.644 75.6445 138.035 76.1277 138.035H83.7119C84.1951 138.035 84.5869 137.644 84.587 137.161C84.587 136.678 84.1951 136.286 83.7119 136.286H76.1277ZM102.266 136.286C101.782 136.286 101.39 136.678 101.39 137.161C101.391 137.644 101.782 138.035 102.266 138.035H109.85C110.333 138.035 110.725 137.644 110.725 137.161C110.725 136.678 110.333 136.286 109.85 136.286H102.266ZM115.338 136.286C114.855 136.286 114.463 136.678 114.463 137.161C114.463 137.644 114.855 138.035 115.338 138.035H122.923C123.406 138.035 123.798 137.644 123.798 137.161C123.798 136.678 123.406 136.286 122.923 136.286H115.338ZM128.403 136.286C127.92 136.286 127.528 136.678 127.528 137.161C127.528 137.644 127.92 138.035 128.403 138.035H135.988C136.471 138.035 136.863 137.644 136.863 137.161C136.863 136.678 136.471 136.286 135.988 136.286H128.403ZM141.468 136.286C140.985 136.286 140.593 136.678 140.593 137.161C140.593 137.644 140.985 138.035 141.468 138.035H149.053C149.536 138.035 149.928 137.644 149.928 137.161C149.928 136.678 149.536 136.286 149.053 136.286H141.468ZM154.541 136.286C154.058 136.286 153.666 136.678 153.666 137.161C153.666 137.644 154.058 138.035 154.541 
138.035H162.125C162.609 138.035 163 137.644 163.001 137.161C163.001 136.678 162.609 136.286 162.125 136.286H154.541ZM167.614 136.286C167.131 136.286 166.739 136.678 166.739 137.161C166.739 137.644 167.131 138.035 167.614 138.035H175.198C175.681 138.035 176.073 137.644 176.073 137.161C176.073 136.678 175.681 136.286 175.198 136.286H167.614ZM180.671 136.286C180.188 136.286 179.796 136.678 179.796 137.161C179.796 137.644 180.188 138.035 180.671 138.035H188.255C188.739 138.035 189.13 137.644 189.131 137.161C189.131 136.678 188.739 136.286 188.255 136.286H180.671Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M444.85 38.2277C447.164 38.2277 449.04 40.1028 449.04 42.4159V132.928C449.04 135.241 447.164 137.116 444.85 137.116H255.36C253.046 137.116 251.17 135.241 251.17 132.928V42.4159C251.17 40.1028 253.046 38.2277 255.36 38.2277H444.85ZM361.96 125.388C361.618 125.046 361.064 125.046 360.722 125.388L354.534 131.572C354.192 131.914 354.192 132.468 354.534 132.81C354.876 133.151 355.43 133.151 355.772 132.81L361.96 126.624C362.301 126.283 362.301 125.73 361.96 125.388ZM371.047 116.311C370.705 115.969 370.15 115.969 369.809 116.311L364.446 121.671C364.104 122.012 364.104 122.567 364.446 122.908C364.788 123.249 365.342 123.25 365.684 122.908L371.047 117.548C371.388 117.207 371.388 116.652 371.047 116.311ZM380.124 107.246C379.782 106.904 379.227 106.904 378.885 107.246L373.523 112.606C373.181 112.948 373.181 113.502 373.523 113.844C373.864 114.185 374.419 114.185 374.761 113.844L380.124 108.483C380.465 108.142 380.465 107.587 380.124 107.246ZM385.736 65.8841C385.891 64.6727 384.622 63.7845 383.536 64.3434L371.069 70.7636C370.124 71.2504 369.96 72.5334 370.752 73.2424L381.2 82.5938C382.11 83.4081 383.561 82.8672 383.717 81.6557L384.393 76.3725C391.143 77.1933 398.567 80.7709 404.771 86.9711C411.124 93.3213 414.726 100.952 415.43 107.827C415.573 109.221 416.819 110.236 418.214 110.093C419.609 109.95 420.624 108.703 420.481 
107.309C419.644 99.1317 415.435 90.4514 408.362 83.3817C401.466 76.489 393.038 72.3185 385.038 71.338L385.736 65.8841ZM389.2 98.1733C388.859 97.8319 388.304 97.8318 387.962 98.1733L382.6 103.534C382.258 103.875 382.258 104.429 382.6 104.771C382.941 105.112 383.496 105.112 383.838 104.771L389.2 99.4108C389.542 99.0693 389.542 98.5149 389.2 98.1733ZM398.262 89.1047C397.92 88.7633 397.365 88.7632 397.024 89.1047L391.661 94.4649C391.319 94.8065 391.319 95.3608 391.661 95.7024C392.002 96.0436 392.557 96.0438 392.899 95.7024L398.262 90.3421C398.603 90.0007 398.603 89.4463 398.262 89.1047ZM416.431 70.9616C416.089 70.6202 415.534 70.6201 415.193 70.9616L409.83 76.3218C409.488 76.6634 409.488 77.2177 409.83 77.5592C410.172 77.9005 410.726 77.9007 411.068 77.5592L416.431 72.199C416.772 71.8575 416.772 71.3032 416.431 70.9616ZM425.508 61.891C425.166 61.5496 424.611 61.5495 424.27 61.891L418.907 67.2512C418.565 67.5928 418.565 68.1471 418.907 68.4887C419.249 68.8299 419.803 68.8301 420.145 68.4887L425.508 63.1284C425.849 62.787 425.849 62.2326 425.508 61.891ZM434.569 52.8146C434.227 52.4731 433.673 52.4731 433.331 52.8146L427.968 58.1748C427.626 58.5163 427.627 59.0706 427.968 59.4122C428.31 59.7534 428.864 59.7537 429.206 59.4122L434.569 54.052C434.91 53.7105 434.91 53.1562 434.569 52.8146ZM443.638 43.7479C443.296 43.4065 442.742 43.4064 442.4 43.7479L437.037 49.1081C436.695 49.4496 436.696 50.004 437.037 50.3455C437.379 50.6868 437.933 50.687 438.275 50.3455L443.638 44.9853C443.98 44.6438 443.979 44.0895 443.638 43.7479Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M684.066 38.2277C687.798 38.2281 689.667 42.7391 687.027 45.3773L596.473 135.889C595.687 136.675 594.621 137.116 593.51 137.116H506.335C504.021 137.116 502.145 135.241 502.145 132.928V42.4159C502.145 40.1028 504.021 38.2277 506.335 38.2277H684.066ZM514.603 124.566C514.261 124.224 513.707 124.224 513.365 124.566L507.178 130.751C506.836 131.093 506.836 131.646 
507.178 131.988C507.519 132.329 508.073 132.329 508.415 131.988L514.603 125.803C514.945 125.462 514.945 124.908 514.603 124.566ZM523.689 115.491C523.348 115.15 522.794 115.15 522.452 115.491L517.09 120.852C516.748 121.193 516.748 121.747 517.09 122.088C517.431 122.43 517.985 122.43 518.327 122.088L523.689 116.728C524.031 116.386 524.031 115.833 523.689 115.491ZM532.102 65.8295C530.707 65.6872 529.46 66.7017 529.318 68.0957C529.175 69.4896 530.189 70.7355 531.584 70.8787C538.463 71.5825 546.096 75.1826 552.45 81.5329C558.723 87.8037 562.312 95.3226 563.079 102.13L557.738 102.392C556.518 102.452 555.865 103.855 556.607 104.827L565.115 115.969C565.76 116.814 567.051 116.751 567.611 115.847L574.992 103.928C575.635 102.889 574.848 101.555 573.628 101.615L568.161 101.882C568.161 101.878 568.162 101.874 568.161 101.871C567.324 93.6931 563.114 85.0124 556.041 77.9425C548.968 70.873 540.283 66.6668 532.102 65.8295ZM532.766 106.421C532.425 106.079 531.871 106.079 531.529 106.421L526.166 111.781C525.825 112.123 525.825 112.676 526.166 113.018C526.508 113.359 527.062 113.359 527.403 113.018L532.766 107.657C533.108 107.316 533.108 106.762 532.766 106.421ZM541.843 97.3445C541.501 97.003 540.948 97.003 540.606 97.3445L535.243 102.705C534.901 103.046 534.902 103.6 535.243 103.941C535.585 104.283 536.139 104.283 536.48 103.941L541.843 98.5809C542.185 98.2393 542.185 97.686 541.843 97.3445ZM550.92 88.2778C550.578 87.9363 550.025 87.9363 549.683 88.2778L544.32 93.638C543.978 93.9796 543.978 94.5329 544.32 94.8745C544.662 95.2161 545.215 95.2161 545.557 94.8745L550.92 89.5142C551.262 89.1727 551.262 88.6193 550.92 88.2778ZM569.066 70.1405C568.724 69.799 568.17 69.7991 567.829 70.1405L562.466 75.5008C562.124 75.8423 562.124 76.3956 562.466 76.7372C562.808 77.0788 563.361 77.0788 563.703 76.7372L569.066 71.377C569.407 71.0354 569.407 70.4821 569.066 70.1405ZM578.143 61.0699C577.801 60.7284 577.247 60.7285 576.906 61.0699L571.543 66.4302C571.201 66.7717 571.201 67.3251 571.543 
67.6666C571.885 68.0082 572.438 68.0082 572.78 67.6666L578.143 62.3064C578.484 61.9648 578.484 61.4115 578.143 61.0699ZM587.219 51.9896C586.878 51.6481 586.324 51.6481 585.982 51.9896L580.62 57.3498C580.278 57.6914 580.278 58.2447 580.62 58.5863C580.961 58.9279 581.515 58.9279 581.857 58.5863L587.219 53.2261C587.561 52.8845 587.561 52.3312 587.219 51.9896ZM596.288 42.9249C595.947 42.5833 595.392 42.5833 595.05 42.9249L589.689 48.2851C589.347 48.6267 589.347 49.18 589.689 49.5216C590.03 49.863 590.584 49.8631 590.926 49.5216L596.288 44.1613C596.63 43.8198 596.63 43.2664 596.288 42.9249Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M850.814 38.2277C854.547 38.2281 856.416 42.739 853.777 45.3773L763.223 135.889C762.437 136.674 761.371 137.116 760.26 137.116H673.176C669.443 137.116 667.574 132.605 670.213 129.966L760.768 39.4544C761.554 38.6692 762.62 38.2277 763.731 38.2277H850.814ZM761.338 121.8C760.855 121.8 760.463 122.191 760.463 122.674V131.13H762.213V122.674C762.213 122.191 761.821 121.8 761.338 121.8ZM761.338 108.971C760.855 108.971 760.463 109.363 760.463 109.846V118.301H762.213V109.846C762.213 109.363 761.821 108.971 761.338 108.971ZM761.338 96.1402C760.855 96.1406 760.463 96.5321 760.463 97.0149V105.47H762.213V97.0149C762.213 96.532 761.821 96.1404 761.338 96.1402ZM782.263 71.887C781.043 71.951 780.395 73.3571 781.139 74.3257L784.474 78.6631C779.115 82.951 771.242 85.7443 762.35 85.7444C753.366 85.7442 745.421 82.8944 740.059 78.5305C738.972 77.6461 737.373 77.8099 736.488 78.8961C735.602 79.983 735.766 81.582 736.853 82.467C743.231 87.6574 752.348 90.8207 762.35 90.8209C772.209 90.8208 781.205 87.746 787.568 82.6884L790.833 86.9341C791.577 87.9025 793.103 87.6391 793.479 86.4767L797.791 73.138C798.118 72.127 797.33 71.1017 796.268 71.1566L782.263 71.887ZM761.338 70.4847C760.855 70.4851 760.463 70.8767 760.463 71.3594V79.8147H762.213V71.3594C762.213 70.8766 761.821 70.485 761.338 70.4847ZM761.338 
57.656C760.855 57.6564 760.463 58.048 760.463 58.5307V66.986H762.213V58.5307C762.213 58.0479 761.821 57.6563 761.338 57.656ZM761.338 44.8293C760.855 44.8297 760.463 45.2212 760.463 45.704V54.1592H762.213V45.704C762.213 45.2211 761.821 44.8295 761.338 44.8293Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M995.759 38.2277C999.53 38.228 1001.42 42.5171 998.752 45.0253L959.55 81.9005L905.796 41.5363C905.271 41.1418 904.662 41.0182 904.096 41.0994L997.485 130.319C1000.15 132.828 998.262 137.116 994.491 137.116H905.298C902.96 137.116 901.065 135.333 901.065 133.134V42.0941C901.065 42.0204 901.07 41.9483 901.079 41.8786C901.258 39.8345 903.079 38.2277 905.298 38.2277H995.759Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M505.873 0C506.657 4.57042e-05 507.307 0.195499 507.823 0.587023C508.338 0.969046 508.596 1.53802 508.596 2.29251C508.596 2.76034 508.467 3.19015 508.209 3.58162C507.951 3.96344 507.497 4.26401 506.848 4.48361V4.54114C507.65 4.67487 508.205 4.96191 508.51 5.4012C508.816 5.83087 508.969 6.31772 508.969 6.86193C508.969 7.74056 508.672 8.41851 508.08 8.89604C507.497 9.38304 506.733 9.62731 505.787 9.62738C504.861 9.62738 504.158 9.42172 503.68 9.0111C503.212 8.60054 502.935 8.08005 502.849 7.44993L503.881 7.10571L503.924 7.24028C504.035 7.54934 504.211 7.82925 504.454 8.07986C504.731 8.36635 505.166 8.50986 505.758 8.50989C506.465 8.50989 506.943 8.32772 507.191 7.9648C507.449 7.6019 507.579 7.20078 507.579 6.7615C507.579 6.2173 507.378 5.80683 506.977 5.52992C506.585 5.25295 505.93 5.10026 505.013 5.07161V4.15402C505.901 4.12537 506.489 3.92484 506.776 3.55237C507.062 3.18009 507.206 2.82242 507.206 2.47876C507.206 1.62801 506.752 1.17539 505.845 1.12237L505.658 1.11749C505.467 1.11752 505.242 1.14605 504.985 1.2033C504.736 1.25105 504.511 1.3274 504.31 1.43245L504.081 2.56457L503.05 2.44951L503.322 0.687461C503.666 0.49653 504.068 0.33454 504.526 0.200875C504.985 0.0671945 505.434 0 505.873 0Z\"\n    fill=\"currentColor\" />\n  
\u003Cpath\n    d=\"M905.727 2.30616L904.638 2.4066L904.466 1.26083H901.428V3.72497C901.533 3.71544 901.643 3.71034 901.757 3.71034H902.086C902.755 3.71034 903.386 3.78668 903.979 3.93949C904.58 4.09229 905.068 4.38363 905.44 4.8132C905.822 5.23335 906.014 5.84949 906.014 6.66106C906.014 7.64468 905.722 8.38068 905.14 8.86776C904.557 9.36434 903.783 9.6127 902.818 9.61275C901.91 9.61275 901.213 9.40711 900.725 8.99648C900.248 8.59544 899.96 8.08007 899.865 7.44993L900.911 7.10571C901.007 7.49723 901.203 7.8271 901.499 8.09449C901.795 8.37131 902.211 8.50985 902.746 8.50989C903.395 8.50989 903.869 8.33787 904.165 7.99405C904.461 7.65981 904.609 7.22507 904.609 6.69031C904.609 5.87861 904.337 5.3625 903.792 5.14279C903.248 4.91361 902.612 4.79958 901.886 4.79955C901.695 4.79955 901.489 4.80365 901.27 4.8132C901.059 4.82275 900.854 4.83701 900.653 4.85611L900.224 4.44071V0.143343H905.569L905.727 2.30616Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M765.49 6.04576H766.966L766.837 7.14862H765.49V9.48404H764.185V7.14862H759.857L759.713 6.04576L762.909 0.143343H765.49V6.04576ZM760.96 6.04576H764.185V1.26083H763.541L760.96 6.04576Z\"\n    fill=\"currentColor\" />\n  \u003Cpath d=\"M4.80573 6.47481H6.41154V7.60693H1.81068V6.47481H3.50235V1.27546H1.81068V0.143343H4.80573V6.47481Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M254.359 0C255.353 0 256.055 0.239186 256.466 0.716715C256.877 1.18447 257.083 1.68072 257.083 2.20573C257.083 2.85516 256.849 3.44346 256.38 3.96875C255.912 4.49397 255.348 4.96638 254.689 5.38657C254.039 5.79717 253.437 6.15968 252.883 6.47481H256.423L256.538 5.42948L257.599 5.51529L257.426 7.60693H251.407L251.292 6.58987C252.582 5.73032 253.638 4.98523 254.46 4.35489C255.281 3.71509 255.693 3.05632 255.693 2.37832C255.693 1.53787 255.166 1.11749 254.115 1.12237L254.115 1.11749C253.924 1.11754 253.695 1.14604 253.427 1.2033C253.16 1.25104 252.916 1.32238 252.697 1.41783L252.467 
2.47876L251.45 2.3637L251.707 0.60165C252.118 0.401088 252.563 0.253475 253.041 0.15797C253.519 0.0529708 253.958 1.99446e-05 254.359 0Z\"\n    fill=\"currentColor\" />\u003C/g>",{"id":13,"data":14,"type":15,"maxContentLevel":30,"version":31,"tiles":32},"03ff3cef-d0c9-4aed-bc37-d6b9f5ab2245",{"type":15,"title":16,"tagline":17,"description":17,"featureImageSquare":18,"baseColor":19,"emoji":20,"shapePreference":21,"allowContentSuspension":22,"allowContentEdits":22,"editorsChoice":6,"accreditations":23,"certificatePriceLevel":28,"certificationTitle":29},8,"Statistics for Data Science: Advanced Level","An advanced guide to statistical analysis for data science","ddbb359d-994c-4d88-8dd4-49e2263466e2","#9A8F74","📊",4,true,[24],{"authority":25,"wasCpdTill":26,"previousCpdCreditMinutes":27},1,"2025-12-31T00:00:00Z",120,3,"Statistics for Data Science (Level 3)",9,6,[33,227,474,649,755,937,1098],{"id":34,"data":35,"type":30,"maxContentLevel":28,"version":25,"orbs":38},"b03e15be-046f-406e-8de8-775175894929",{"type":30,"title":36,"tagline":37},"Advanced Properties of Your Data","More complicated distributions and methods of analysis.",[39,92,163],{"id":40,"data":41,"type":42,"version":25,"maxContentLevel":28,"pages":44},"4715626f-da95-4875-b2e3-efe3f1833360",{"type":42,"title":43},2,"Kurtosis and Its Types",[45,61,77],{"id":46,"data":47,"type":25,"maxContentLevel":28,"version":25,"reviews":51},"8c08925d-2867-4c69-8eb1-baa22b9bb8e8",{"type":25,"title":48,"contentRole":42,"markdownContent":49,"audioMediaId":50},"Kurtosis ","Excess kurtosis is a statistical measurement that tells us about the shape of a distribution. It specifically tells us how \"peaked\" or \"flat\" a distribution is compared to a normal distribution. \n\nA normal distribution is a symmetric bell-shaped curve that has an excess kurtosis of 0. It has the same amount of data on both sides of the mean, median, and mode.\n\n ![Graph](image://584bdcf3-fe89-48b8-b97b-a26c567d7833 \"Fat vs. 
thin-tailed kurtosis\")\n\nIf a distribution has a positive kurtosis, it means that it is more peaked than a normal distribution. This is often referred to as a \"fat-tailed\" distribution because the tails (or extremes) of the distribution are \"fatter\" than in a normal distribution. \n\nOn the other hand, if a distribution has a negative kurtosis, it means that it is more flat than a normal distribution. This is often referred to as a \"thin-tailed\" distribution because the tails are \"thinner\" than in a normal distribution. \n\nIn conclusion, kurtosis can help you identify the shape of a distribution and help you tell if it is fat-tailed, normal-tailed or thin-tailed.\n","044ec0f3-001f-4b83-b046-926338220859",[52],{"id":53,"data":54,"type":55,"version":25,"maxContentLevel":28},"073d9a1e-bca5-4bea-b58e-843f4c2f3d09",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":56,"clozeWords":58},11,[57],"Kurtosis tells us how \"peaked\" or \"flat\" a distribution is compared to a normal distribution.",[59,60],"\"peaked\"","\"flat\"",{"id":62,"data":63,"type":25,"maxContentLevel":28,"version":25,"reviews":67},"31585e89-b4f4-46c7-8a20-62342369846b",{"type":25,"title":64,"contentRole":42,"markdownContent":65,"audioMediaId":66},"Labelling Kurtosis "," ![Graph](image://39a5ba23-d6b2-4a0a-92d9-3d435117dc9e \"Different kinds of kurtorsis\")\n\nIf kurtosis is greater than 3, then the distribution is Leptokurtic. A Leptokurtic distribution has a high peak, declines rapidly as you move away from the mean, and has heavy tails – more outliers. \n\nIf kurtosis is less than 3, then it is Platykurtic. It will have a flatter top – not always as flat as the uniform distribution – and it will be mostly body, no long tails. \n\nAnd what if kurtosis is exactly 3? Then it is Mesokurtic. It has a moderate peak, and it’s best represented by the normal distribution. 
\n\nFor a real-world application of the interpretation of kurtosis – it is often used as a measure of financial risk. The higher the kurtosis, the higher the risk because the asset is more volatile. You can make high returns, but it can also generate large losses. ","41acb766-0dc4-4ad7-88b6-c93c45663473",[68],{"id":69,"data":70,"type":55,"version":25,"maxContentLevel":28},"3aef3d10-04d4-4f7c-92fb-fa72204f353a",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":71,"binaryCorrect":73,"binaryIncorrect":75},[72],"What is the term used to describe a distribution with a high peak, rapid decline, and heavy tails?",[74],"Leptokurtic",[76],"Platykurtic",{"id":78,"data":79,"type":25,"maxContentLevel":28,"version":25,"reviews":83},"c6002cbf-3ea7-47f5-a3a0-1391627beefd",{"type":25,"title":80,"contentRole":42,"markdownContent":81,"audioMediaId":82},"Clearing up the confusion about Kurtosis and fat tails","While a Platykurtic distribution might look like it has fatter tails, it is actually a thin-tailed distribution because outliers are infrequent. Many people get confused because the tails can look thicker – because they can be higher on the x-axis. However, a Platykurtic distribution is like an elephant – a very small proportion of its weight is in the tail. \n\n ![Graph](image://2fabd294-3638-4572-b1af-c01a164a5916 \"Different kinds of Kurtosis\")\n\nIn contrast, a Leptokurtic distribution is fat-tailed because there are a lot of outliers – not to mention these outliers can be very large and far away from the mean. A Leptokurtic distribution is like a leaping Kangaroo - a large proportion of its weight is in the tail. 
","9ce2b9b7-6a1f-40be-a1a1-38655bee8139",[84],{"id":85,"data":86,"type":55,"version":25,"maxContentLevel":28},"8b0b9a00-95ee-4ba0-8982-d5ee643687fc",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":87,"clozeWords":89},[88],"A Platykurtic distribution is thin tailed, while a Leptokurtic distribution is fat tailed.",[90,91],"thin","fat",{"id":93,"data":94,"type":42,"version":25,"maxContentLevel":28,"pages":96},"df975da8-218b-4201-b321-f0eca03c7a8c",{"type":42,"title":95},"Regression Analysis and Line of Best Fit",[97,115,131,145],{"id":98,"data":99,"type":25,"maxContentLevel":28,"version":25,"reviews":103},"2cc8fe98-1b2a-4da8-b6c6-7674356dcdc5",{"type":25,"title":100,"contentRole":42,"markdownContent":101,"audioMediaId":102},"Line of best fit ","\n ![Graph](image://666bbafb-1866-484b-8bff-ba329637b194 \"A line of best fit\")\n\n\nThe line of best fit is drawn on scatter plots and represents the best prediction of the dependent variable that could be made, based on the value of the independent variable. \n\nConsider for example that we have two different dependent variable values for the exact same value of the independent variable across two observations; any estimate must therefore fall somewhere in between the two points. \n\nWhen we only have two values, we can estimate simply by taking the average of our two values. For example, if we have age on the X-axis (our independent variable) and length of commute on the Y-axis (our dependent variable). We have two data points for age 30. One commutes 30 minutes, one commutes 60 minutes. Our estimate must fall somewhere between these two values - the average would be 45 minutes. \n\nHowever, when we have many values, we need to create a reliable rule for estimation and prediction. That reliable rule is the line of best fit. In a regression analysis, it is called the regression line. 
","228729b7-9c48-43dc-b1e3-ba4ba93710c0",[104],{"id":105,"data":106,"type":55,"version":25,"maxContentLevel":28},"a37e175b-db75-4a36-94c9-fa4774d480b3",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":107,"multiChoiceCorrect":109,"multiChoiceIncorrect":111},[108],"What is the name of the line that is drawn on a scatter plot to represent the best prediction of the dependent variable?",[110],"Line of best fit",[112,113,114],"Bell Curve","Middle Line","Average Line",{"id":116,"data":117,"type":25,"maxContentLevel":28,"version":25,"reviews":121},"4992e28e-b244-4814-86c3-f30188d6e1f9",{"type":25,"title":118,"contentRole":42,"markdownContent":119,"audioMediaId":120},"Regression analysis ","Regression analysis is a statistical technique that is used to model the relationship between a dependent variable and one or more independent variables. The dependent variable is the variable that is being predicted, while the independent variable is the variable that is used to make the prediction. \n\nThe goal of regression analysis is to find the best fitting model to describe the relationship between the dependent and independent variables. \n\nFor example, regression analysis can be used to understand how the price of a house (the dependent variable) is influenced by multiple independent variables like the size of the house, the area, the age and the number of bedrooms. 
","b4853324-00e1-42e4-99a9-3591cd5b83f7",[122],{"id":123,"data":124,"type":55,"version":25,"maxContentLevel":28},"f0c062dd-4fc9-47a7-ab79-65e36d1d8afa",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":125,"binaryCorrect":127,"binaryIncorrect":129},[126],"What is the goal of regression analysis?",[128],"To find the best fitting model to describe the relationship between the dependent and independent variables",[130],"To predict the dependent variable",{"id":132,"data":133,"type":25,"maxContentLevel":28,"version":25,"reviews":137},"43b5b401-5e9e-422b-9292-bb1753f9bfa8",{"type":25,"title":134,"contentRole":42,"markdownContent":135,"audioMediaId":136},"Simple and multiple linear regression analysis ","\nSimple linear regression is used when we want to predict a single dependent variable using a single independent variable. For example, we might use the number of hours studied to predict a student's test score. In this case, the number of hours studied would be the independent variable, and the test score would be the dependent variable.\n\n\n\nMultiple linear regression is used when we want to predict a single dependent variable using multiple independent variables. For example, we might use a student's number of hours studied, their class attendance, and their previous test scores to predict their next test score. In this case, the number of hours studied, class attendance, and previous test scores would be the independent variables, and the next test score would be the dependent variable.\n\nIn both simple and multiple linear regression, we use statistical analysis to find the best-fit line (or equation) that describes the relationship between the independent variables and the dependent variable. 
This line can then be used to make predictions about the dependent variable, given a set of values for the independent variables.","a904af04-e0f3-4c2a-9348-b915b7fef277",[138],{"id":139,"data":140,"type":55,"version":25,"maxContentLevel":28},"8572fe57-8791-45e0-9380-25a10d40902f",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":141,"activeRecallAnswers":143},[142],"Which kind of linear regression is used to predict a single dependent variable using a single independent variable?",[144],"Simple linear regression",{"id":146,"data":147,"type":25,"maxContentLevel":28,"version":25,"reviews":151},"55e1be4b-92cc-4e2a-a87e-cfa9da663d17",{"type":25,"title":148,"contentRole":42,"markdownContent":149,"audioMediaId":150},"Residuals - how the line of best fit is found in regression analysis","\n\nResiduals, in statistics, are the difference between the actual value of a data point and the predicted value of that data point. The line of best fit is the line that minimizes the sum of the squared residuals.\n\nYou may hear the term ‘error’ when discussing residuals. The error is, as you might have guessed, the difference between the actual and predicted value, otherwise called the residual. \n\n\n ![Graph](image://ef4ef731-9c3f-4a63-8d7e-bd619e7da323 \" \")\n\n\nThe residuals are illustrated by the red and green lines shown in the image above. It is the distance between our line of best fit, and our value. \n\nYou will notice the blue line intercepts the y-axis at 3. This is called our ‘intercept’ and the steepness of the line is our ‘slope’. These are represented in a regression equation as follows:\n\nM = slope\n\nB = intercept \n\nX = the value of our datapoint\n\nY = mX + b \n\nIn minimizing the residuals via the ‘least squares method’ as it is called, we are finding the values of M and B that minimize the sum of squared residuals. \n\nOur line of best fit must be straight - it cannot curve. 
That is why it is further away from some points than it is from others. But overall, its position and slope is one that minimizes the sum of the squared errors for all points. \n","1c534b87-60f0-4fd0-8d07-96851a95c277",[152],{"id":153,"data":154,"type":55,"version":25,"maxContentLevel":28},"53cea6dd-adef-466b-ad4f-2f70a6f79e75",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":155,"multiChoiceCorrect":157,"multiChoiceIncorrect":159},[156],"What is the term used to describe the process of finding the line of best fit in regression analysis?",[158],"Least squares method",[160,161,162],"Maximum squares method","Linear squares method","Quadratic squares method",{"id":164,"data":165,"type":42,"version":25,"maxContentLevel":28,"pages":167},"be43e998-7c34-4613-b2a7-b0b495db9665",{"type":42,"title":166},"Homoscedasticity and Heteroscedasticity",[168,184,198,214],{"id":169,"data":170,"type":25,"maxContentLevel":28,"version":25,"reviews":174},"48663c0d-d89a-442b-8149-06288c219e5d",{"type":25,"title":171,"contentRole":42,"markdownContent":172,"audioMediaId":173},"Homoscedasticity ","Homoscedasticity, or 'homogeneity of variance', means constant variance within groups. It gives you an idea of how spread out your data is. If the variances are not homogeneous, the results of your tests may be biased.\n\nIn homoscedastic data, the data points will be evenly distributed. You can see an example of this in the image below. There's as much variance between the data at the start of the curve as there is in the middle or at the end. 
This suggests consistent data, which is easier to work with.\n\n ![Graph](image://4dfaf9f9-33c7-4d09-a3ca-53a07e87bfb1 \" \")\n\n\n\n\n\n","f99fc8b7-4d16-42a9-a3b6-ff0e4ecd0874",[175],{"id":176,"data":177,"type":55,"version":25,"maxContentLevel":28},"c3e9c0f4-daaf-4866-a497-39972b519f58",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":178,"binaryCorrect":180,"binaryIncorrect":182},[179],"What term is used to describe data in which some points are close to the mean and others are far away?",[181],"Heteroscedasticity",[183],"Homogeneity",{"id":185,"data":186,"type":25,"maxContentLevel":28,"version":25,"reviews":190},"93dc4d81-cbd1-4ee0-bf48-c502b7d1db46",{"type":25,"title":187,"contentRole":42,"markdownContent":188,"audioMediaId":189},"Heteroscedasticity ","The opposite of homoscedastic data is heteroscedastic data. This term desribes data which varies non-constantly. You can see two examples of heteroscedastic data in the image below. \n\n ![Graph](image://b389189b-1f8c-46a8-9382-d0ea3eccb030 \" \")\n\nUnlike in the homoscedastic example, the variance between the data in the heteroscedastic examples is non-constant. In the 'bow tie' example, the variance starts wide, then narrows in the middle, before widening again. In the 'fan' example, the variance starts wide, then gradually narrows. 
\n\n\n\n\n\n","cc87e24d-f434-4644-aa10-0320fb610e07",[191],{"id":192,"data":193,"type":55,"version":25,"maxContentLevel":28},"9336fdb8-9408-4649-8a68-9d34f5dc09a5",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":194,"clozeWords":196},[195],"Homoscedasticity is when the data exhibits equal variances, both within and between groups.",[197],"Homoscedasticity",{"id":199,"data":200,"type":25,"maxContentLevel":28,"version":25,"reviews":204},"b5a4d7f4-1f8c-4605-9c59-68841224a478",{"type":25,"title":201,"contentRole":42,"markdownContent":202,"audioMediaId":203},"Causes or sources of heteroscedasticity   ","\nHeteroscedasticity can be caused by several different factors. It can result from differences in time series data – like seasonal fluctuations – or inaccuracies in your measurement tool – for example, your measurement tool might become more and more inaccurate due to changes in the external environment over time. \n\nIt could also be that your measurement tool exhibits greater variance as the inputs it is supposed to measure become greater. 
For example, a device might measure the wattage of batteries, but be less accurate and exhibit higher variance in readings for higher wattages.\n","67628420-563e-4ee3-b164-a2c12898d01a",[205],{"id":206,"data":207,"type":55,"version":25,"maxContentLevel":28},"fcf5a404-cb6b-4c93-9c28-8bcebfc0bafb",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":208,"activeRecallAnswers":210},[209],"What can cause heteroscedasticity?",[211,212,213],"Differences in time series data","Inaccuracies in measurement tools","Greater variance as inputs become greater",{"id":215,"data":216,"type":25,"maxContentLevel":28,"version":25,"reviews":220},"8d4e0086-05bc-4ebf-8d79-a73c16ee03fe",{"type":25,"title":217,"contentRole":42,"markdownContent":218,"audioMediaId":219},"Implications of heteroscedasticity in predictive statistics"," ![Graph](image://94db7922-d4b2-4e53-a5b5-81331c724ab8 \" \")\n\nHeteroscedasticity occurs when the spread or dispersion of the residuals differs systematically from one part of the dataset to another.\n\nWhen conducting predictive statistics, for example by using a regression analysis, this means that your model may provide more accurate predictions at one end of the data range, while at the other end of the data range, the predictions are less accurate. \n\nYou can still perform a regression analysis on such data, but your results will be less accurate outside of a certain range. 
\n","86fe5b8c-bc30-4a9b-9799-4ed8370fc193",[221],{"id":222,"data":223,"type":55,"version":25,"maxContentLevel":28},"24f0b534-cb67-409d-ab54-be8c0bcf6a4a",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":224,"activeRecallAnswers":226},[225],"What term describes when a model's predictions are less accurate outside of a certain range?",[181],{"id":228,"data":229,"type":30,"maxContentLevel":28,"version":25,"orbs":232},"7a8ef660-0107-475e-b896-34d7d940fdd3",{"type":30,"title":230,"tagline":231},"Hypothesis Testing ","Learn the foundations of statistics – the foundational methodology of science’s greatest achievements.",[233,294,331,393,445],{"id":234,"data":235,"type":42,"version":25,"maxContentLevel":28,"pages":237},"c79edf2a-6cee-4721-ae05-4487a49a8ccd",{"type":42,"title":236},"Understanding Hypotheses",[238,256,272],{"id":239,"data":240,"type":25,"maxContentLevel":28,"version":25,"reviews":244},"7934c68e-0ba2-4dcd-a46a-05d549050b31",{"type":25,"title":241,"contentRole":42,"markdownContent":242,"audioMediaId":243},"What is a hypothesis? ","\nA hypothesis is a proposed explanation or prediction about a phenomenon or event that can be tested through further investigation and experimentation. It is an educated guess or an assumption based on prior knowledge, observations, and logical reasoning. In scientific research, a hypothesis is a tentative statement that can be either confirmed or refuted based on empirical evidence. \n\nA well-formulated hypothesis includes a clear and testable statement, a prediction about the expected outcome of the experiment, and a proposed explanation of why the predicted outcome would occur. \n\nThe process of testing a hypothesis involves collecting data, analyzing it, and drawing conclusions about whether or not the hypothesis is supported by the evidence.\n\nFor example, you could formulate a hypothesis that the average Kangaroo is taller than the average 12 year-old child, and then collect data. 
These data might either support or refute this hypothesis — though it is important to note this is not the same as *proving* or *disproving* a statement.","b9b9013a-7487-47ed-b482-aef6b0f04296",[245],{"id":246,"data":247,"type":55,"version":25,"maxContentLevel":28},"cdf2aec7-3591-4e6d-b8c6-499aeafd63d6",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":248,"multiChoiceCorrect":250,"multiChoiceIncorrect":252},[249],"What is the purpose of hypothesis testing?",[251],"To test a prediction",[253,254,255],"To make a prediction","To formulate a hypothesis","To conduct a research experiment",{"id":257,"data":258,"type":25,"maxContentLevel":28,"version":25,"reviews":262},"6997b553-fbeb-4198-bea2-761eb2764ee7",{"type":25,"title":259,"contentRole":42,"markdownContent":260,"audioMediaId":261},"Formulating hypotheses","\n\nThe formulation of hypotheses, and then testing of them, in a replicable and repeatable manner, is the foundation of the scientific method. For an experiment to be considered valid, other people need to be able to do the same experiment and get similar results.\n\nYou might start with the prediction that eating a full and balanced breakfast is good for learning. This is all well and good, but how are you going to test that? Well, you’ll need to formulate a hypothesis! \n\nHypothesis: ‘Grade 8 students who eat a balanced breakfast of wholegrains and milk will perform better on a mathematics test than students who do not eat breakfast’. \n\nDo you notice how the hypothesis is much more specific than the prediction? That’s because it has to be. So, a hypothesis is a very specifically worded and formulated prediction, that you will be able to reliably and consistently test. 
\n","071b28ae-ab9d-41f3-bb0b-cb2b0bad79a1",[263],{"id":264,"data":265,"type":55,"version":25,"maxContentLevel":28},"f3488766-a1b0-4693-8c9b-b161a5dde200",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":266,"binaryCorrect":268,"binaryIncorrect":270},[267],"What is the difference between a prediction and a hypothesis?",[269],"A hypothesis is more specific and can be tested reliably and consistently.",[271],"A prediction is more specific and can be tested reliably and consistently.",{"id":273,"data":274,"type":25,"maxContentLevel":28,"version":25,"reviews":278},"d99cbc35-a8a5-44c2-a30d-9a56ff510c4e",{"type":25,"title":275,"contentRole":42,"markdownContent":276,"audioMediaId":277},"The Null and Alternative Hypothesis","\n ![Graph](image://8c3b9ec1-b32e-422d-a19c-bba7922576a4 \" \")\n\nTwo key terms you need to know when it comes to hypothesis testing are the null hypothesis and the alternative hypothesis. The null hypothesis states that there is no difference between the samples we are testing – for example, placebo and treatment groups for a new medicine. The alternative hypothesis states that there is a difference. \n\nIn our kangaroo experiment, the null hypothesis is that there is no difference in height. When testing hypotheses, we can only ever reject the null hypothesis or fail to reject the null hypothesis. We can never accept the null or the alternative hypothesis. \n\nThis is because we have gathered evidence against the null hypothesis, evidence that either succeeds in allowing us to reject the null or not – and because there is always error in statistics, we cannot prove either hypothesis for certain. \n\n\n\nStatistical tests can be used to test effects of interventions – like new medicines – existing differences between populations – like whether people in Germany are taller than people in the UK – and correlations – like the strength of the relationship between fossil fuel consumption and global average temperatures. 
\n\n","203f50db-8121-439d-b24a-d4dffba94d8d",[279,287],{"id":280,"data":281,"type":55,"version":25,"maxContentLevel":28},"9d4f5789-6d44-4809-878c-947a6c16dd19",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":282,"activeRecallAnswers":284},[283],"What are the two kinds of hypothesis in hypothesis testing?",[285,286],"Null hypothesis","Alternative hypothesis",{"id":288,"data":289,"type":55,"version":25,"maxContentLevel":28},"cb504ed6-f3b9-4642-826f-6eb9d53f14af",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":290,"clozeWords":292},[291],"We can only ever reject or fail to reject the null hypothesis.",[293],"fail to reject",{"id":295,"data":296,"type":42,"version":25,"maxContentLevel":28,"pages":298},"6e8dd63b-e778-4ce7-9510-bad9f45ff68f",{"type":42,"title":297},"Designing Hypothesis Tests",[299,313],{"id":300,"data":301,"type":25,"maxContentLevel":28,"version":25,"reviews":305},"3fded653-6e71-43de-bb76-a0dab1efb157",{"type":25,"title":302,"contentRole":42,"markdownContent":303,"audioMediaId":304},"Study design for hypothesis testing","The two main methods for testing hypotheses on samples are between-subjects and within-subjects study designs. Between subjects tests the difference between Jack and Jill. Within Subjects tests the difference within Jack before and after he fell down the hill. The third method is mixed – or factorial – design, which is a blend of both between and within subject design. \n\n\n\n ![Graph](image://9debd62a-a350-47f1-ac1b-30fbe348de58 \"Between-subject vs. within-subject design\")\n\n\nWith a between-subjects study design, you have two different groups and you compare their outcomes, like people who take a new treatment drug and people who don’t. \n\nBetween subjects design is useful when you don’t want to introduce things like learning effects, which is possible with within-subjects design. \n\n\nWith a within-subject design, you test changes within the same person, or observation. 
For example, if you compare somebody’s 100m sprint time before and after taking a new energy drink, you are doing a within-subject test.\n\nPut simply, it means testing the same sample twice under different conditions.\n\nWithin-subject design is useful because it requires fewer participants. It also improves the chance that you find a true effect of your independent variable.","be7fd23e-0ac0-4b8b-bd76-8c5f32a7c007",[306],{"id":307,"data":308,"type":55,"version":25,"maxContentLevel":28},"4e0eff23-e22d-4f6b-814b-c7843825d8f1",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":309,"activeRecallAnswers":311},[310],"What is the other term used for 'mixed' hypothesis testing methods?",[312],"Factorial",{"id":314,"data":315,"type":25,"maxContentLevel":28,"version":25,"reviews":319},"52897b8c-17ed-4cf0-a068-d7ff553ddb5a",{"type":25,"title":316,"contentRole":42,"markdownContent":317,"audioMediaId":318},"Mixed or Factorial design  ","\nMixed – or factorial – design means testing different groups under different conditions.\n\nFor example, you might compare 100m sprint time on first attempt and second attempt for two different groups  – people who tried a new energy drink between attempts 1 and 2, and people who didn’t take the energy drink. \n\nThat way you can test if it was the energy drink that is responsible for any effects, or if people are just slower/faster on their second attempt. 
\n","b294b55f-ee5d-47d1-b06b-9da31353e6dd",[320],{"id":321,"data":322,"type":55,"version":25,"maxContentLevel":28},"fa6f155c-5295-4208-bc4c-7016a22a0100",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":323,"multiChoiceCorrect":325,"multiChoiceIncorrect":327},[324],"What type of design is used to compare two different groups under different conditions?",[326],"Mixer or Factorial design",[328,329,330],"A/B Testing","Control Group","Randomized Design",{"id":332,"data":333,"type":42,"version":25,"maxContentLevel":28,"pages":335},"75750bdb-59e6-4b10-8223-555d970ae6c9",{"type":42,"title":334},"Errors in Hypothesis Testing",[336,354,377],{"id":337,"data":338,"type":25,"maxContentLevel":28,"version":25,"reviews":342},"921336bd-45db-4688-834a-b135456170a4",{"type":25,"title":339,"contentRole":42,"markdownContent":340,"audioMediaId":341},"Type I errors in hypothesis testing","\nThere are two main types of error that you can make when testing your hypothesis – type I errors and type II errors. \n\nFirst, you have type I errors. These are where we mistakenly reject a null hypothesis. In other words, we claim that something is true when it is false. \n\nThese are also known as false positives – for example, when you tell everyone about your great new study which found that this cool new supplement increases your test scores by 10%. But actually your sample size was small – it turns out you were wrong. 
So you’ve gone and claimed a positive correlation incorrectly.\n","68fb82d7-1495-4397-9c67-61562ae96c6b",[343],{"id":344,"data":345,"type":55,"version":25,"maxContentLevel":28},"fad636ea-9c9b-4d84-b880-54f9ccdc3ada",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":346,"multiChoiceCorrect":348,"multiChoiceIncorrect":350},[347],"What is the term for when we mistakenly reject a null hypothesis?",[349],"Type I error",[351,352,353],"Type II error","Type III error","Type IV error",{"id":355,"data":356,"type":25,"maxContentLevel":28,"version":25,"reviews":360},"e79664ea-10eb-48f9-8d45-c315e9b63f42",{"type":25,"title":357,"contentRole":42,"markdownContent":358,"audioMediaId":359},"Type II errors in hypothesis testing","\nA type II error is when we fail to reject a null hypothesis, when that hypothesis is actually false. This is also known as a false negative.\n\nFor example, let's say a new drug is being tested to see if it helps lower cholesterol levels. The null hypothesis is that the drug has no effect on cholesterol levels. The alternative hypothesis is that the drug does have an effect on cholesterol levels. \n\nThe alternative hypothesis is correct – the drug has an effect on cholesterol levels. But we don't notice that effect, so we fail to reject the null hypothesis. 
This is an example of a type II error: we failed to reject the false null hypothesis.\n","4420e807-b16d-4a25-bbb6-ea3c82ac9c65",[361,368],{"id":362,"data":363,"type":55,"version":25,"maxContentLevel":28},"1fb861bc-d4c0-4f30-991c-4213874ca94b",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":364,"clozeWords":366},[365],"A type II error is also known as a false negative.",[367],"false negative",{"id":369,"data":370,"type":55,"version":25,"maxContentLevel":28},"c8b5a562-2c16-4347-a7d2-9af6f313e17d",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":371,"multiChoiceCorrect":373,"multiChoiceIncorrect":374},[372],"What is the name of the error that occurs when we mistakenly accept a null hypothesis, thinking something is false when it is actually true?",[351],[349,375,376],"Hypothesis Testing","False positive",{"id":378,"data":379,"type":25,"maxContentLevel":28,"version":25,"reviews":383},"4cd8b3dc-4369-4498-90af-3de5de58b8a1",{"type":25,"title":380,"contentRole":42,"markdownContent":381,"audioMediaId":382},"Confidence intervals "," ![Graph](image://665e6a79-bdbf-429b-a17f-8a29ae162b5e \" \")\n\nConfidence intervals are a way to express the level of uncertainty around a statistic, such as a sample mean. They provide a range of values that is likely to contain the true population parameter with a certain level of confidence.\n\nFor example, if you take a sample of students and calculate their average test score, you can use a confidence interval to express the range of scores that you expect the average test score for the entire student population to fall within.\n\nA common level of confidence used is 95%. 
This means that if you were to repeat the process of taking a sample and calculating a confidence interval many times, about 95% of the intervals would contain the true population parameter.\n\n","63d763a9-3d0c-4f8c-a61c-4f2588758fb0",[384],{"id":385,"data":386,"type":55,"version":25,"maxContentLevel":28},"bd715464-ce73-4b89-b1f0-158c41c94f61",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":387,"binaryCorrect":389,"binaryIncorrect":391},[388],"What is the common level of confidence used when calculating a confidence interval?",[390],"95%",[392],"90%",{"id":394,"data":395,"type":42,"version":25,"maxContentLevel":28,"pages":397},"56265349-265d-4ada-a7ff-7795e21423ea",{"type":42,"title":396},"Statistical Significance",[398,412,428],{"id":399,"data":400,"type":25,"maxContentLevel":28,"version":25,"reviews":404},"6cc5b9a4-429a-480d-a509-583c135b5b7d",{"type":25,"title":401,"contentRole":42,"markdownContent":402,"audioMediaId":403},"The P-value"," ![Graph](image://6249f5ab-b57b-4173-9b2a-da136a882592 \"An illustration of the P-value\")\n\nA P-value is a way to help you decide whether the results of a study are strong enough to support a certain conclusion. Basically, if there was no relationship between your variables, how likely would it be for your results to happen completely by chance?\n\nThe P-value, or 'probability value', is a number between 0 and 1. The smaller the P-value, the less likely it would be for these results to happen completely by chance. For example, if the P-value is 0.5, that means it's 50% likely that the results you're seeing are due to chance. \n\nA good P-value is anything less than 0.05 – a 5% likelihood that your results are down to chance, and a 95% likelihood that they are down to a real relationship. If the P-value is greater than 0.05, you can't be sure that the results are not due to chance, and you can't say for sure that the treatment caused the effect. 
","bda263a4-6de6-445b-8e1b-ba5c3469c59e",[405],{"id":406,"data":407,"type":55,"version":25,"maxContentLevel":28},"cf2f6d07-8332-4ddd-b4ac-e91cc58026a2",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":408,"clozeWords":410},[409],"A P-value is a number between 0 and 1 that tells you the likelihood that the results you are seeing are just due to chance.",[411],"P-value",{"id":413,"data":414,"type":25,"maxContentLevel":28,"version":25,"reviews":418},"bac4a3f4-518d-4fe4-920a-3056efbd7ab5",{"type":25,"title":415,"contentRole":42,"markdownContent":416,"audioMediaId":417},"The P-value & Hypothesis testing","We can never prove the alternative hypothesis to be right. But, we can reject the null hypothesis. Which means we are certain, with a p-value and level of statistical significance to back it up, that there is a difference. \n\nOur level of significance is something we set ourselves before we conduct our study, but common values include .01 and .05. For example, if we set .05 as our p-value, and from our study we get a p-value of less than .05 – that tells us there is a less than 5% chance that our data came from a population where the null hypothesis is true, we are quite certain that there is a difference – so we can reject the null hypothesis.\n\n For example, imagine a study comparing the heights of men and women. Our null hypothesis would be that there is no statistically significant difference. It is the P value that we use to determine whether differences are statistically significant. 
In other words, if we have a P value of less than .05, we say that there is less than a 5% chance our results are due to chance.","f268ebb8-9f89-445c-bd2c-080e1a40ad2d",[419],{"id":420,"data":421,"type":55,"version":25,"maxContentLevel":28},"5542885f-f504-4c93-9f7f-7a466b74b901",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":422,"binaryCorrect":424,"binaryIncorrect":426},[423],"What is the value of the P-value which is commonly used to determine if differences are statistically significant?",[425],"0.05",[427],"0.01",{"id":429,"data":430,"type":25,"maxContentLevel":28,"version":25,"reviews":434},"2109a9ae-7d80-4edd-97f9-d001398746ec",{"type":25,"title":431,"contentRole":42,"markdownContent":432,"audioMediaId":433},"Statistical power ","\nStatistical power is a measure of how likely a statistical test is to detect an effect, if one exists. It is something you calculate before you run your analysis – to ensure that you have enough data to draw the conclusions that you would like to be able to draw from your test.\n\nThere are four things you need to know when it comes to calculating the power of your test – the type of statistical test you plan to use, the significance level you are using, the sample size you are planning to use, and the effect size you want to be able to detect. \n\nWith this information you can calculate the power of your study design to achieve the aims you would like it to. 
\n\nIf your power is not high enough, you need to either recruit more participants or find more data/observations – increase your sample size – or lower the expectations for your study and its ability to detect large effect sizes or find very statistically significant results.\n","939919f1-6109-49b9-9db8-8e2cd68b3dc4",[435],{"id":436,"data":437,"type":55,"version":25,"maxContentLevel":28},"1544e94b-993e-40c0-ad2f-606baa2acc11",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":438,"activeRecallAnswers":440},[439],"What four pieces of information are needed to calculate the power of a statistical test?",[441,442,443,444],"Type of statistical test","Significance level","Sample size","Effect size",{"id":446,"data":447,"type":42,"version":25,"maxContentLevel":28,"pages":449},"8cdf3001-446d-4828-9505-b6832d56ddf7",{"type":42,"title":448},"Types of T-Tests",[450,456],{"id":451,"data":452,"type":25,"maxContentLevel":28,"version":25},"3a3e611c-390b-4fd7-b086-983e8e82e5c1",{"type":25,"title":453,"contentRole":42,"markdownContent":454,"audioMediaId":455},"One-tailed versus two-tailed tests ","\n ![Graph](image://fa1f6ad1-c0a8-41e0-a657-d6f0e82cde8f \" \")\n\t\n\nA one-tailed t-test and a two-tailed t-test are both statistical methods used to determine whether a sample mean is significantly different from a known or hypothesized population mean. The main difference between the two tests is the direction of the difference that is being tested.\n\nA one-tailed t-test is used when the research hypothesis predicts the direction of the difference (i.e., whether the sample mean is greater or less than the population mean). It is also known as a directional test. For example, a one-tailed t-test could be used to test whether a new drug is more effective than a placebo.\n\nA two-tailed t-test, on the other hand, is used when the research hypothesis does not predict the direction of the difference. It is also known as a non-directional test. 
For example, a two-tailed t-test could be used to test whether a new drug is different from a placebo, without specifying whether it is more or less effective.\n\n","07daa1d5-c23c-4a60-873e-105ba7e1da7d",{"id":457,"data":458,"type":25,"maxContentLevel":28,"version":25,"reviews":462},"d1909a81-8a0d-4ea3-b1be-91f25101ff4d",{"type":25,"title":459,"contentRole":42,"markdownContent":460,"audioMediaId":461},"One-tailed t-tests, an example ","A one-tailed t-test is a statistical method used to determine whether a sample mean is significantly different from a known or hypothesized population mean. The test is used when the research hypothesis predicts the direction of the difference (i.e., whether the sample mean is greater or less than the population mean).\n\nFor example, let's say a pharmaceutical company wants to test the effectiveness of a new drug for treating depression. They randomly assign patients to either the treatment group (receiving the new drug) or the control group (receiving a placebo). \n\nAfter 8 weeks, the researchers measure the severity of depression using a standardized scale. They can use a one-tailed t-test to determine if the mean depression score for the treatment group is significantly lower than the mean depression score for the control group. 
In this case, the research hypothesis would be that the new drug is effective in reducing depression.","149948a3-7f8d-40c5-aeca-1b5bfe28c799",[463],{"id":464,"data":465,"type":55,"version":25,"maxContentLevel":28},"350ec3a4-aa75-4dfc-8b6f-34b72850a353",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":466,"multiChoiceCorrect":468,"multiChoiceIncorrect":470},[467],"What type of test would be used to determine if the mean depression score for a treatment group is significantly lower than the mean depression score for the control group?",[469],"One-tailed t-test",[471,472,473],"Two-tailed t-test","ANOVA","Chi-Square",{"id":475,"data":476,"type":30,"maxContentLevel":28,"version":25,"orbs":479},"7ad1db0b-2f3e-4679-b8dc-505c1de14b12",{"type":30,"title":477,"tagline":478},"Variable Types and Effect Size","Learn advanced concepts in statistics such as confounding variables, z-scores, and effect size",[480,561,594],{"id":481,"data":482,"type":42,"version":25,"maxContentLevel":28,"pages":484},"cd087d2e-74be-48d9-b47c-4ed5d8b321df",{"type":42,"title":483},"Variables in Research",[485,503,517,533,547],{"id":486,"data":487,"type":25,"maxContentLevel":28,"version":25,"reviews":491},"9173fb8d-db47-45f9-a3d9-77d6dde2bfc2",{"type":25,"title":488,"contentRole":42,"markdownContent":489,"audioMediaId":490},"Independent variables versus dependent variables ","\nWhenever you’re conducting an experiment to see how one thing affects another, you will have at least one Independent Variable – IV – and at least one Dependent Variable – DV. So what does that mean? \n\n![Graph](image://11a98e37-9d6a-47f1-b673-afd4d89123f5 \"A teenager playing video games\")\n\nYour independent variable is the variable you think is the cause of your effect. For example, imagine that your hypothesis was that ‘playing video games leads to greater creativity’. 
In this example, your independent variable would be ‘time spent playing video games’.\n\nYour dependent variable is the variable that you think is dependent on the independent variable. In the above example, your dependent variable would be the ‘amount of creativity’. Put simply, you think that changes in the independent variable cause the dependent variable to change. \n\nGenerally, when we are conducting experiments, we are manipulating one variable, which is the independent variable and observing or measuring the effects on the dependent variable. In this case, we are making people play more video games and then testing their creativity! So, if in doubt, ask yourself, which one are you manipulating? That’s your independent variable.","582328ce-c679-4f06-b750-05b0033f064d",[492],{"id":493,"data":494,"type":55,"version":25,"maxContentLevel":28},"abf72bf3-a60c-47e1-b7d8-a43e655ab32e",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":495,"multiChoiceCorrect":497,"multiChoiceIncorrect":499},[496],"What is the variable that is being manipulated in an experiment?",[498],"Independent Variable",[500,501,502],"Dependent Variable","Effect Size","Variable Types",{"id":504,"data":505,"type":25,"maxContentLevel":28,"version":25,"reviews":509},"9371ec2a-f3f7-4c58-84cf-f01950aba771",{"type":25,"title":506,"contentRole":42,"markdownContent":507,"audioMediaId":508},"Mediating variables","A mediating variable is a third variable that sits between two variables, being affected by the first, and also influencing the second - the first being your independent variable, and the second being your dependent variable. \n\nAs an example, if you’re studying the effects of sleep on educational achievement, alertness could be a mediating variable. So there is a causal pathway that runs from amount of sleep (independent), to alertness (mediating), to academic achievement (dependent). 
\n\nThe result here is that academic achievement could be improved by influencing the independent variable (getting more sleep) but only because it influences the mediating variable (alertness). Another factor that influences the mediating variable, like doing exercise in the morning, might also have the same effect.\n","36dd3e7f-ab07-4b62-a1bd-22c1076afa15",[510],{"id":511,"data":512,"type":55,"version":25,"maxContentLevel":28},"5fdd1a84-db0f-4313-a4b1-5816cb1b3e7f",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":513,"activeRecallAnswers":515},[514],"What is a third variable that sits between two variables and influences the second?",[516],"Mediating variable",{"id":518,"data":519,"type":25,"maxContentLevel":28,"version":25,"reviews":523},"03575334-0070-4270-b33d-d407e88ff59a",{"type":25,"title":520,"contentRole":42,"markdownContent":521,"audioMediaId":522},"Moderating variables","Where a mediating variable is almost like a go-between for two variables, a moderating variable is slightly different. Instead of connecting the independent variable and the dependent variable together, it directly impacts the strength or direction of the relationship that already exists between them.\n\nIn a study of sleep and educational achievement, coffee drinking could be a moderating variable. If a participant drinks a lot of coffee (moderating), it might reduce the extent to which the amount of sleep (independent) has an impact on academic achievement (dependent).  \n\nModerating variables affect not only the strength of relationships, but also the direction. Imagine a study of education level and marital prospects. Men of higher levels of education might be more likely to marry than men with lower levels of education, whereas women with higher levels of education might actually be less likely to marry than women with lower levels of education. 
In this case, gender is a moderating factor that changes the direction of the relationship.","d591e136-0490-4b37-aeeb-6c15cda60a12",[524],{"id":525,"data":526,"type":55,"version":25,"maxContentLevel":28},"f2797e50-2f75-410f-997e-30353252dfcb",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":527,"binaryCorrect":529,"binaryIncorrect":531},[528],"How can moderator variables affect the results of a study?",[530],"They can influence the strength and direction of the relationship",[532],"They are the cause of the relationship",{"id":534,"data":535,"type":25,"maxContentLevel":28,"version":25,"reviews":539},"ee54b91c-336b-4198-8a35-829ff6e7fcc4",{"type":25,"title":536,"contentRole":42,"markdownContent":537,"audioMediaId":538},"Confounding variables, an example"," ![Graph](image://8e5f1101-e05a-4486-a34d-123ad1e6d268 \"Two curves tracking violent crime and ice cream sales\")\n\nTo understand confounding variables, let’s pose the question – can an ice cream commit murder? \n\nWhat if I told you that as the sales of ice cream rise, so does the murder rate? That means there is a positive correlation between sales of ice cream and the murder rate and your statistical tests show that this relationship is significant. So does that mean that ice cream is causing violence?\n\nOr, is there a confounding variable here at play? Well, most likely. And that confounding variable is temperature. As the temperature rises, more people are out and interacting instead of at home hiding from the cold. As a result, there is an increase in the murder rate. Perhaps people get a little hot and bothered, too. The sales of ice cream also go up when the temperature rises. 
So, temperature is a confounding variable that could cause you to wrongly blame ice cream for acts of violence.\n","e152b2ab-d04b-4976-a0ca-fa3c87109795",[540],{"id":541,"data":542,"type":55,"version":25,"maxContentLevel":28},"6bd1cbe5-8f85-4c6a-99f3-91d9dd2708ae",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":543,"activeRecallAnswers":545},[544],"What is a confounding variable?",[546],"A variable that can explain the relationship between two other variables without either of them causing the other",{"id":548,"data":549,"type":25,"maxContentLevel":28,"version":25,"reviews":553},"2c324ee9-6bee-47dc-8a2b-c3f0a0c0d1cb",{"type":25,"title":550,"contentRole":42,"markdownContent":551,"audioMediaId":552},"Mediating variables versus confounding variables ","A mediating variable is a variable that acts as a link between an independent variable and a dependent variable. The relationship is causal, and all variables are in fact related. \n\nA confounding variable on the other hand is a variable that affects the relationship between two variables that share no causal relationship, two independent variables. The confounding variable makes it seem as if they are related, but they are not. \n\nAs you can see in the graphic below, a confounding variable influences or is correlated with the two independent variables below. 
\n\n ![Graph](image://f592393d-cea7-4e7e-9ff3-8a681018e3fb \" \")\n\nThis is in comparison to the mediating variable, which sits within a causal chain between the independent and dependent variable.\n\n ![Graph](image://1ef93242-2056-4acd-b23f-3e71a5df8475 \" \")\n\n","7c76dd5d-19c5-4b7e-a269-d57943dfeac7",[554],{"id":555,"data":556,"type":55,"version":25,"maxContentLevel":28},"31eb0215-68ad-4bcd-a39e-11858337ad4e",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":557,"activeRecallAnswers":559},[558],"What is a variable that affects the relationship between two variables that share no causal relationship?",[560],"Confounding variable",{"id":562,"data":563,"type":42,"version":25,"maxContentLevel":28,"pages":565},"0204f03b-6221-42bc-b502-616dbcf17c3f",{"type":42,"title":564},"Understanding Z-Scores",[566,580],{"id":567,"data":568,"type":25,"maxContentLevel":28,"version":25,"reviews":572},"3d17a77e-1163-4e55-a124-23314b199b53",{"type":25,"title":569,"contentRole":42,"markdownContent":570,"audioMediaId":571},"Z-scores ","\nThe z-score tells you how far away a value is from either a known population mean or your sample’s mean. Specifically, the z-score tells you how many standard deviations away it is. \n\nSo what does that all mean? \n\nWell, with the z-score, you can find the percentile that corresponds to your value. It’s really useful for figuring out how awesome you did on your test because you used Kinnu. For example, let’s say you score 180 on your test which is way above the class average of 150. Assume that your distribution has a standard deviation of 10. So what is your z-score? 
Well, in mathematical notation the formula looks like this:\n\n ![Graph](image://44914e43-2021-4065-9a26-cfff0d229130 \" \")\n\nIn more plain English, it looks like this: \n\nz = (x - mean) / standard deviation, where x is the data point, mean is the mean of the dataset, and standard deviation is the standard deviation of the dataset.\n\nSo, we know your score is 3 standard deviations from the mean… Now what? Well that’s when we use a z-table. The Z-Table, which is a table of stored values that you can find online or in most scientific calculators, will show you that a value 3 standard deviations above the mean is in the 99th percentile – 99.87 – which means you scored better than 99.87% of test takers! ","f5afe5b2-366b-4323-9e3c-5562f4c147ef",[573],{"id":574,"data":575,"type":55,"version":25,"maxContentLevel":28},"8ecb85a3-79c7-4c86-a861-e4bcb39d7233",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":576,"clozeWords":578},[577],"The Z-score tells you how many standard deviations away a value is from the mean and can be used to find the percentile of the value.",[579],"Z-score",{"id":581,"data":582,"type":25,"maxContentLevel":28,"version":25,"reviews":586},"ca2545c0-f806-4a2d-a7f5-d166bd7b6847",{"type":25,"title":583,"contentRole":42,"markdownContent":584,"audioMediaId":585},"Z-tables","\nIf you know how to calculate a z-score you’ll be able to compare your result to a z-table, to find the corresponding percentile value. As an example, if your test score was 145, but the average for your class was only 100, with a standard deviation of 15, then you could calculate your z-score and compare it to the z-table to find out just how smart you are. \n\n ![Graph](image://08248cad-2342-443a-8aa2-75b4ead2ba71 \" \")\n\nLet’s look at that on the z-table… \n\n ![Graph](image://e39c67b6-1cc3-4c15-a5fd-8c844a0a768c \" \")\n\nFirst you take your first number – before the decimal – in our case, 3. 
Locate that on the y-axis – the vertical axis – and then locate your decimal, in our case 0, on the x-axis – the horizontal axis. The value at the intersection of these, is your percentile. In our case, your test score was higher than 99.87% of people!\n\n","951c3ee4-83fe-48f6-8cba-e3cc2a9dd76b",[587],{"id":588,"data":589,"type":55,"version":25,"maxContentLevel":28},"31324261-133a-4633-95eb-8f2aa3467aa1",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":590,"activeRecallAnswers":592},[591],"How can you use a z-score to find a corresponding percentile value?",[593],"Compare the z-score to a z-table",{"id":595,"data":596,"type":42,"version":25,"maxContentLevel":28,"pages":598},"e76e15f2-c525-4bec-af44-517fc640fee0",{"type":42,"title":597},"Understanding Effect Size",[599,617,633],{"id":600,"data":601,"type":25,"maxContentLevel":28,"version":25,"reviews":605},"a2830d79-0a81-426b-ab2a-c2d648150756",{"type":25,"title":602,"contentRole":42,"markdownContent":603,"audioMediaId":604},"Effect size ","\nWhile statistical significance – the p-value – is influenced by the number of observations in your sample, the effect size is not. Effect size doesn’t care about how many observations you have. Effect size is based purely on the actual data – the measurements – not the number of measurements you have. \n\nCommon measures of effect sizes include Cohen’s d which is used whenever you’re comparing two means – for example, ‘does strength training for 5 weeks improve a basketballer’s vertical jump’ where you compare the mean jump height both before and after the 5 week strength training example. \n\nAnother measure of effect size you might already be familiar with is Pearson’s correlation coefficient. Again, it doesn’t matter how many observations you have, just how strongly correlated the data is. 
Pearson’s correlation coefficient measures the strength of the relationship between variables – like the relationship between time spent studying in Kinnu and test scores. \n","fff10b1d-b352-4b42-871a-fc91c7353083",[606],{"id":607,"data":608,"type":55,"version":25,"maxContentLevel":28},"1284d6e0-c121-4d39-ae83-1703d03dc7cc",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":609,"multiChoiceCorrect":611,"multiChoiceIncorrect":613},[610],"What is the measure of effect size used to compare two means?",[612],"Cohen's d",[614,615,616],"Pearson's correlation coefficient","Spearman's rank correlation coefficient","Kendall's tau",{"id":618,"data":619,"type":25,"maxContentLevel":28,"version":25,"reviews":623},"fde90022-c072-4994-9b22-aa9992c636d4",{"type":25,"title":620,"contentRole":42,"markdownContent":621,"audioMediaId":622},"Interpreting effect size ","\nThe two main measures for effect size are Cohen’s d – for comparing the magnitude of the difference between two means – and Pearson’s r for finding the strength of a correlation. Neither tell you the significance of a relationship, but they do tell you the size or magnitude of it. This tells you how important an entry is in the context of your data. \n\nBelow is a rough guide as to what constitutes a small or a large effect size. Just keep in mind that Pearon’s r can range between -1 and 1. To use the table below, just take the absolute value and ignore any minus sign. \n\nCohen’s d on the other hand can be anywhere between 0 and infinity! 
But it only takes a value of 0.8 or more for it to be considered a large effect size.\n\n ![Graph](image://06ba7fe3-d7a6-4669-a24b-9ebe640efdcd \" \")","e5d3db7d-2331-4e3c-b73d-a20a5803df50",[624],{"id":625,"data":626,"type":55,"version":25,"maxContentLevel":28},"3b3e1344-41ac-4aaf-b403-e98ac440b211",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":627,"binaryCorrect":629,"binaryIncorrect":631},[628],"Cohen's d is considered a large effect size when it is greater than what?",[630],"0.8",[632],"0.6",{"id":634,"data":635,"type":25,"maxContentLevel":28,"version":25,"reviews":639},"232c486d-2a0e-4791-9cd3-bfa741d40d19",{"type":25,"title":636,"contentRole":42,"markdownContent":637,"audioMediaId":638},"How effect size relates to sample size and power","\nWhile effect size is not dependent on your sample size, you can use it to calculate what sample size you might need to reliably detect an effect size of the desired magnitude for your study. Because your ability to detect a statistically significant difference – via your p-value – is related to your sample size.\n\nFor example, if you know you want to find a large effect size in your experiment, if one exists – then you will need enough observations in your sample so that the statistical test has enough power to detect an effect of that size. The effect is either in your data or it’s not. The difference of that size is either there or it is not. But, you need enough observations to uncover it, if it is there. The number of observations don’t create the effect size, but they do help you uncover it. \n\nMore observations gives your test more statistical power to detect these effect sizes. More statistical power requires more observations. 
\n","e0cebadb-9ae7-4d60-b292-d3a60d4f7a48",[640],{"id":641,"data":642,"type":55,"version":25,"maxContentLevel":28},"3bb2b3be-0750-4acb-97b7-4dce25e72881",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":643,"binaryCorrect":645,"binaryIncorrect":647},[644],"What is the relationship between sample size and power in detecting an effect size?",[646],"More observations gives the test more statistical power to detect the effect size.",[648],"Sample size determines the effect size.",{"id":650,"data":651,"type":30,"maxContentLevel":28,"version":25,"orbs":654},"7d7345ec-ebb8-4075-b241-1ce471c715da",{"type":30,"title":652,"tagline":653},"Samples and Populations (Metrics)","How samples and populations work for advanced statistics.",[655,696],{"id":656,"data":657,"type":42,"version":25,"maxContentLevel":28,"pages":659},"1efde4e1-af9c-4e3c-8e86-183ef20b0990",{"type":42,"title":658},"Understanding Parameters and Statistics",[660,674,680],{"id":661,"data":662,"type":25,"maxContentLevel":28,"version":25,"reviews":666},"a3f166ab-2711-4236-951b-23aec5f8c03e",{"type":25,"title":663,"contentRole":42,"markdownContent":664,"audioMediaId":665},"Parameters versus statistics"," ![Graph](image://c7ed5bf6-ef22-42dc-a5c9-30a11a5e5bfd \"Parameters vs. Statistics\")\n\nStatistics and parameters are related but distinct concepts in statistics.\n\nA parameter is a fixed value that describes a population. It is a characteristic of a population that is usually unknown, but can be estimated using sample statistics. For example, the population mean (μ) of the height of adult humans is a parameter. It is the true average height of all adult humans, but it is not known exactly.\n\nStatistics, on the other hand, are values that describe a sample. They are used to make inferences about the population from which the sample was drawn. 
For example, if we take a sample of 100 adult humans and measure their heights, we can calculate the sample mean (x̄) which is an estimate of the population mean (μ). The sample mean is a statistic, not a parameter.","d6ca9dea-6f26-49ab-bc44-d145cc077c56",[667],{"id":668,"data":669,"type":55,"version":25,"maxContentLevel":28},"98154303-5d67-43d0-9f3f-0d83e8646527",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":670,"clozeWords":672},[671],"Parameters are characteristics of a population, while statistics are values that describe a sample.",[673],"statistics",{"id":675,"data":676,"type":25,"maxContentLevel":28,"version":25},"82364917-43d8-48d1-945f-7ea7e17ac66f",{"type":25,"title":677,"contentRole":42,"markdownContent":678,"audioMediaId":679},"Symbology for parameters versus statistics","\nWhen we talk about populations and samples, we use different symbology for each, even when we are talking about the same concept. Take variance as an example.\n\nIf it’s the population parameter, we use sigma squared – σ². but if it is a sample statistic, we use S². Both mean variance, which is a measure of how far away from the mean your data is spread out. So, make sure you use the correct symbols depending on whether you’re talking about populations or samples!\n\nHere’s a table to make it easier!\n\n\n ![Graph](image://12d207cd-86ce-47b0-9964-201e2bb6024b \" \")","ec0c4365-3eb1-4e83-b16f-4792fc1618de",{"id":681,"data":682,"type":25,"maxContentLevel":28,"version":25,"reviews":686},"bbc75d8f-cae0-4471-859a-02e429ceada0",{"type":25,"title":683,"contentRole":42,"markdownContent":684,"audioMediaId":685},"Point Estimates ","\nA point estimate is when we make a prediction or inference about our population based on our sample data. Consider for example, if you calculated the average mean height of a random sample of 1000 people taken from your town, which is your population of interest. \n\nThen, you could use that mean as a point estimate for the entire population. 
So from only a small group of people, you can estimate the average height of your town, or even your entire country! \n\n\n ![Graph](image://06e399da-316f-4e28-aa65-e30662128469 \"x\")","3df3b6ea-7b22-4208-b14a-0bec846bf8e8",[687],{"id":688,"data":689,"type":55,"version":25,"maxContentLevel":28},"e9a1dafb-ac47-4e1f-aea9-bfc8d3576954",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":690,"binaryCorrect":692,"binaryIncorrect":694},[691],"What is a point estimate used for?",[693],"To make a prediction or inference about a population based on sample data",[695],"To measure the entire population",{"id":697,"data":698,"type":42,"version":25,"maxContentLevel":28,"pages":700},"bb9103ab-9624-460e-a787-b027525163ba",{"type":42,"title":699},"Estimation Techniques",[701,719,737],{"id":702,"data":703,"type":25,"maxContentLevel":28,"version":25,"reviews":707},"ff88f4fe-01cb-4a07-841b-0db07f3080dc",{"type":25,"title":704,"contentRole":42,"markdownContent":705,"audioMediaId":706},"Interval Estimates ","\nAn interval estimate, like the confidence interval, gives you a little more room to move than a point estimate. Instead of saying ‘we estimate the average height of adult males in London is 175cm’, which is a point estimate – you can say ‘we are 95% certain that the average height of males in London is between 165 and 180cm’, which is the 95% confidence interval, an interval estimate. \n\nInterval estimates are used to account for uncertainty – a sample can’t tell you everything about a population with 100% accuracy! Sometimes your 95% confidence interval can be so broad that you might not be able to make a decision based on the data, especially if getting it wrong is risky! 
\n\n\n ![Graph](image://ad5150bf-7c02-4752-af44-1444e32cb554 \"x\")","f409f9e7-b1fc-45fa-95ba-87e33fcf2b28",[708],{"id":709,"data":710,"type":55,"version":25,"maxContentLevel":28},"d4efac7e-3d91-4fac-ba06-b8fd8c2b1ec6",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":711,"multiChoiceCorrect":713,"multiChoiceIncorrect":715},[712],"What type of estimate accounts for uncertainty by providing a range of values rather than a single value?",[714],"Interval estimate",[716,717,718],"Point estimate","Sample estimate","Population estimate",{"id":720,"data":721,"type":25,"maxContentLevel":28,"version":25,"reviews":725},"541ca47d-2c72-44c3-9685-50b3a0db3304",{"type":25,"title":722,"contentRole":42,"markdownContent":723,"audioMediaId":724},"Sampling error ","\nWith any statistical test, you will always have some sampling error – that’s because your sample is always going to be smaller than the actual population, which introduces error into your estimates and predictions. \n\nIt means that there is a difference between your sample statistic, for example, the mean, and the population parameter. In the case of the mean, the population parameter that is estimated is called the ‘true average’. \n\nEven the best subject design is subject to sampling error, which is why when we report estimates of population parameters, we often do so in terms of confidence intervals. 
\n","c2a6c9e6-0861-4c08-9554-e24b3552b389",[726],{"id":727,"data":728,"type":55,"version":25,"maxContentLevel":28},"ad087ea0-8a95-41b9-80be-fd402a1070af",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":729,"multiChoiceCorrect":731,"multiChoiceIncorrect":733},[730],"What is the term used to describe the difference between a sample statistic and a population parameter?",[732],"Sampling error",[734,735,736],"Estimation error","Population error","Parameter error",{"id":738,"data":739,"type":25,"maxContentLevel":28,"version":25,"reviews":743},"68aa1454-f007-4e09-8c55-5adb5a2b37d8",{"type":25,"title":740,"contentRole":42,"markdownContent":741,"audioMediaId":742},"Using populations and samples in practice ","\nThe population is a larger group we are interested in studying. For example, all people in the USA. A sample is the portion of that group we were able to gather, like a randomly selected group of 100 Americans. An observation a value we collect about as part of a study - for example, that one participant’s name is James would be an observation \n\nIn statistics, we use our sample to make inferences about the population at large. This is done using summary statistics like the mean, median, and also confidence intervals. \n\nWe can measure things about our sample, like their height. And we can infer from that what the average height of all people in America might be. From our sample data, we can create a 95% confidence interval and we can infer that we are 95% sure that the average height of the American adult population is between 160cm and 183cm. What we cannot do is conclude that the average height is exactly 175cm, because there is always room for error in statistics. \n\nWe could do the same, and instead measure support for a new government policy or proposal. So, statistics can be very useful across a broad range of fields. 
This is why everyone can benefit from learning a little bit of statistics.\n","ee4e5b00-bc9a-48a4-9c63-503d322cf70e",[744],{"id":745,"data":746,"type":55,"version":25,"maxContentLevel":28},"1e2241ce-66ef-4378-9f98-219c1cb3e8e6",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":747,"multiChoiceCorrect":749,"multiChoiceIncorrect":751},[748],"What can we use to make inferences about the population at large?",[750],"Summary statistics",[752,753,754],"Interval confidences","Statistical summaries","Confidence intervals",{"id":756,"data":757,"type":30,"maxContentLevel":28,"version":25,"orbs":760},"e797a672-deab-425b-8038-85c709acd6d7",{"type":30,"title":758,"tagline":759},"Features ","Learn how to manipulate and transform variables for statistics and data science",[761,816],{"id":762,"data":763,"type":42,"version":25,"maxContentLevel":28,"pages":765},"cd7dcd62-5a31-499e-8b55-bd5012e48060",{"type":42,"title":764},"Feature Engineering Techniques",[766,784,802],{"id":767,"data":768,"type":25,"maxContentLevel":28,"version":25,"reviews":772},"0578cec0-0fdc-48c6-86d9-e8359e2d19eb",{"type":25,"title":769,"contentRole":42,"markdownContent":770,"audioMediaId":771},"Feature engineering ","\n\nFeature engineering is the process of finding, creating, and selecting the best data for your model or analysis. This is helpful for statistics and machine learning because using only your raw data might not be optimal for your model’s performance. \n\nJust because you have ‘big data’, that doesn’t mean you have to use it all. It would be kind of like if you were baking: you don’t want to grab all the ingredients in your kitchen and add them to the cake just because you have them there. 
Also, you might be able to get better results by engineering your features, which is changing or altering them in some way.\n\n ![Graph](image://4abdefeb-abf4-4d7b-966b-b3ac51c1cc21 \" \")\n\nThe ingredients you use will determine whether you end up with some delicious results, or a culinary disaster. So, put your chef hat on – let’s get started! ","64b07283-2520-4367-8f6e-c4d966e7909d",[773],{"id":774,"data":775,"type":55,"version":25,"maxContentLevel":28},"3fe45dea-4352-4293-9abb-d09a7d31fa18",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":776,"multiChoiceCorrect":778,"multiChoiceIncorrect":780},[777],"What is the process of finding, creating, and selecting the best data for your model or analysis called?",[779],"Feature engineering",[781,782,783],"Feature selection","Feature analysis","Feature optimization",{"id":785,"data":786,"type":25,"maxContentLevel":28,"version":25,"reviews":790},"e657d70d-0b4c-4421-919f-b4b7c681239f",{"type":25,"title":787,"contentRole":42,"markdownContent":788,"audioMediaId":789},"Ordinal encoding ","\nOrdinal encoding is not something you will have to do every time you run a statistical analysis or create a machine learning model. But, it is helpful to know, because many machine learning models require all inputs to be numeric. Ordinal encoding turns a categorical variable into a numerical one. \n\nWait, what? How is that even possible? Well, it’s a lot simpler than it might sound. For every option you have for your categorical variable, let’s say {‘High School’, ‘College’, ‘Bachelor’s Degree’, ‘Master’s Degree’, ‘PhD’} indicating your observation’s level of education, you create a column named after High School, ‘College’, ‘Bachelor’s Degree’ etc. 
So where your data previously looked like the table below:\n\n ![Graph](image://2fd065c5-989d-45df-8e69-74da4d945593 \" \")\n\nAfter assigning each category option a number, it looks like this:\n\n ![Graph](image://0f6118ec-f2a9-45eb-a518-2c170941be15 \" \")\n\nOrdinal encoding is simple, and easy to reverse. But, if your data is not ordinal in the first place, it will apply to an ordinal relationship where one does not exist. For example if your variable was instead car color or transport type. In this case, ‘one hot encoding’ might be a more suitable encoding option.\n\n","4316efa2-ce06-4a7c-a723-83ad9311064e",[791],{"id":792,"data":793,"type":55,"version":25,"maxContentLevel":28},"5134b0a0-422a-468c-adfd-5b81714c8548",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":794,"multiChoiceCorrect":796,"multiChoiceIncorrect":798},[795],"What encoding option should be used when the categorical variable is not ordinal?",[797],"One hot encoding",[799,800,801],"Binary encoding","Label encoding","Ordinal encoding",{"id":803,"data":804,"type":25,"maxContentLevel":28,"version":25,"reviews":808},"09771c7d-3792-4a8b-aae8-b2f87901c76c",{"type":25,"title":805,"contentRole":42,"markdownContent":806,"audioMediaId":807},"One hot encoding ","\nA common method used for feature encoding is called ‘one-hot encoding’. It is useful when your data is not ordinal, and you don’t want to use ordinal encoding and introduce ordinality where there is none. Basically, you take your categorical data that looks like this:\n\n ![Graph](image://0a149809-e2c2-49d6-accf-7c93dffd14b1 \" \")\n\nYou then turn it into something like the data in the table below, where 1 equals \"true” and 0 to “false”. Each person now has a numerical value for true or false depending on whether they selected that option as their preferred mode of transport, or not. 
\n\n\n ![Graph](image://cddfb13c-e783-4372-a501-ed4bf93adeeb \" \")\n\nThis is particularly useful because computers are designed to interpret binary data.\n\n","f3d00555-d274-42f5-9c3e-960bad50240f",[809],{"id":810,"data":811,"type":55,"version":25,"maxContentLevel":28},"a6f455af-418d-4662-bf85-83f2878b8b75",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":812,"clozeWords":814},[813],"One-hot encoding is a method used to turn categorical data into binary data.",[815],"binary",{"id":817,"data":818,"type":42,"version":25,"maxContentLevel":28,"pages":820},"1285ca22-b298-4f3e-a32f-dd1f8d3507b1",{"type":42,"title":819},"Feature Scaling Methods",[821,837,855,873,887,902,919],{"id":822,"data":823,"type":25,"maxContentLevel":28,"version":25,"reviews":827},"3f01f61e-73ea-49a2-ab54-46ce80256bad",{"type":25,"title":824,"contentRole":42,"markdownContent":825,"audioMediaId":826},"Feature Scaling ","\nFeature scaling is the process of altering your data in some way through normalization or standardization to achieve uniformity in the shape of your distribution. For example, it allows you to prescribe the minimum values, the maximum values, and the variance. It makes your distribution the same shape. \n\nFeature scaling is useful because whenever distances are used for calculations and conclusions within a machine learning algorithm, there is the possibility that one variable can dominate another due to its sheer scale, rather than importance. Some algorithms that benefit from feature scaling because they use euclidean distance as a measurement for comparison. The Euclidean distance is just the length of a straight line drawn between two points.\n\nFor example, age and salary are measured on very different scales. Age can reach just over 100, while for salary you could have multiple millions. The range of distances possible for one variable are much greater than they are for the other. 
\n\n ![Graph](image://02929be3-5537-45ba-af07-0e5538132874 \" \")\n\nSo, we feature scale to give every variable a fair chance at influencing results – to show us what really is most important, and were the relationships are. This stops the biggest bully in the dataset having all the say. ","d05057f2-f9ca-4e40-b86e-4733c78b3117",[828],{"id":829,"data":830,"type":55,"version":25,"maxContentLevel":28},"9fa0c8a0-80cb-4e01-87b4-b2f4567a1ef6",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":831,"binaryCorrect":833,"binaryIncorrect":835},[832],"What is the purpose of feature scaling?",[834],"To give every variable a fair chance at influencing results",[836],"To create uniformity in the shape of the distribution",{"id":838,"data":839,"type":25,"maxContentLevel":28,"version":25,"reviews":843},"593132b5-4ad5-42c8-ae17-5830ec87242d",{"type":25,"title":840,"contentRole":42,"markdownContent":841,"audioMediaId":842},"Normalization ","\nNormalizing the values in your distribution rescales them so that they are all between 0 and 1. While previously you might have had income data that ranged from $10,000 to $2,5,000,000, it creates an easier scale. However, by doing this, you will lose the outliers in your dataset. \n\nNormalization is otherwise known as min-max scaling, and by looking at the equation below, you will see why. 
You use the minimum and maximum values of your variable of interest to normalize each datapoint.\n\n ![Graph](image://eee48109-2b10-49ca-8bed-f980b5b338d5 \"The normalization or min-max scaling equation\")\n","49722eda-9c14-4cc5-b505-0e534962e77c",[844],{"id":845,"data":846,"type":55,"version":25,"maxContentLevel":28},"59ec4f2e-599f-4ca5-a6b2-63a6e3233acb",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":847,"multiChoiceCorrect":849,"multiChoiceIncorrect":851},[848],"What is the other name for the process of normalizing values in a distribution?",[850],"Min-max scaling",[852,853,854],"Standardization","Mean-centering","Z-scoring",{"id":856,"data":857,"type":25,"maxContentLevel":28,"version":25,"reviews":861},"0d9f8a6c-f46e-4cef-a808-2c74a382d6c6",{"type":25,"title":858,"contentRole":42,"markdownContent":859,"audioMediaId":860},"Pros and Cons of Normalization","\nYou should use normalization when your data is not normally distributed, and your model does not make assumptions about the distribution of your data. \n\nThere are some cons to normalization – for example, you will lose your outliers, which may have been important for an understanding of your data. You also lose your original values, they are still there, but on a different scale – so you can’t really interpret the new values in terms of the original measurement variable, like feet or liters. 
\n\n\n ![Graph](image://c1da599e-bd1d-4b48-8ad0-9c90d9f4ffd8 \"x\")","c4a0730c-8009-494d-a240-3cf4fcf78227",[862],{"id":863,"data":864,"type":55,"version":25,"maxContentLevel":28},"a5bb02ad-f3ee-431b-86b9-877e24ee0f4b",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":865,"multiChoiceCorrect":867,"multiChoiceIncorrect":869},[866],"What are some of the drawbacks to using normalization?",[868],"Loss of outliers and loss of original values",[870,871,872],"Loss of accuracy and loss of precision","Loss of detail and loss of context","Loss of meaning and loss of data",{"id":874,"data":875,"type":25,"maxContentLevel":28,"version":25,"reviews":879},"9daa75ec-bd85-41c7-a69b-3a264efaadb0",{"type":25,"title":876,"contentRole":42,"markdownContent":877,"audioMediaId":878},"Standardization ","\nStandardizing your data rescales your data to conform to the standard normal distribution. That means it will have a mean of 0 and a standard deviation of 1. It’s useful when the model you intend to use requires that your data be normally distributed and have similar scales. We do this by subtracting the sample mean from each datapoint, and dividing that by the standard deviation. 
\n\n ![Graph](image://67b82788-e728-44d1-bad1-d09e61ef4d10 \" \")","444378e5-b465-4aa1-ae18-a6e48d34980e",[880],{"id":881,"data":882,"type":55,"version":25,"maxContentLevel":28},"7a39fc8a-44ee-4e92-8f9e-db23f947078f",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":883,"activeRecallAnswers":885},[884],"How do we standardize data to conform to the standard normal distribution?",[886],"By subtracting the sample mean from each datapoint, and dividing that by the standard deviation",{"id":888,"data":889,"type":25,"maxContentLevel":28,"version":25,"reviews":893},"8a3821f5-b439-4288-ad07-108cd101663f",{"type":25,"title":890,"contentRole":42,"markdownContent":891,"audioMediaId":892},"Pros and Cons of Standardization ","\nYou should also use standardization as opposed to normalization when your data is normally distributed, or has outliers. This is because with normalization, you will lose your outliers.\n\nThere are some cons of standardiazation – for example, you lose your original values. They are still there, but on a different scale – so you can’t really interpret the new values in terms of the original measurement variable, like centimetres or dollars. \n\nStandardization should also be used if you plan to do statistical tests like the Analysis of Variance – ANOVA, and use models like regularized linear and logistic regression which assume that your residuals – the distances between your line of best fit and your values – are normally distributed. 
\n\n\n ![Graph](image://0ceb44e9-8a41-4b38-843b-2f05919c498a \"x\")","2c6306e4-6680-47fe-ad7d-95978fd7bda4",[894],{"id":895,"data":896,"type":55,"version":25,"maxContentLevel":28},"ae52c729-8f1e-440e-854c-e57bf38eb16a",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":897,"binaryCorrect":899,"binaryIncorrect":900},[898],"What should be used when data is normally distributed or has outliers?",[852],[901],"Normalization",{"id":903,"data":904,"type":25,"maxContentLevel":28,"version":25,"reviews":908},"ed686274-6d28-48ee-8edf-d5daee4ae838",{"type":25,"title":905,"contentRole":42,"markdownContent":906,"audioMediaId":907},"Which models / algorithms need feature scaling ","\nThere are many models that work by computing distances between data points – if the scales used vary then the results obtained from these models won’t be accurate. This is why we scale our data via normalization or standardization to create uniformity between variables. \n\nSome examples of models that rely on computing distance include: K Nearest Neighbors – KNN – which is a supervised machine learning algorithm that classifies/categorizes new data based on its distance to existing data clusters for which we already know the category. \n\nSupport Vector Machines – SVM – which are also a supervised algorithm that uses distance to separate, group, and classify data points. And finally, K-means clustering –  which is an unsupervised machine learning algorithm, meaning you don’t need to have labelled data, it will find patterns in the data for you based on distances. \n\nOther examples of algorithms sensitive to variables with different ranges include dimension reduction algorithms such as Principal Components Analysis. 
\n\n\n ![Graph](image://2f1b9978-bae9-4b46-a2b6-9180090eba91 \"x\")","c7b8df35-7195-481e-8295-f80d712dbdf3",[909],{"id":910,"data":911,"type":55,"version":25,"maxContentLevel":28},"bbb10bf6-a4d4-45ce-9ba2-8ec97dcda7de",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":912,"multiChoiceCorrect":914,"multiChoiceIncorrect":916},[913],"What is the process of making variables have the same range called?",[915],"Feature scaling",[917,782,918],"Feature normalization","Feature reduction",{"id":920,"data":921,"type":25,"maxContentLevel":28,"version":25,"reviews":925},"a1d29c9e-43b3-4482-a4ec-c57f1814d729",{"type":25,"title":922,"contentRole":42,"markdownContent":923,"audioMediaId":924},"Dummy Encoding ","\nDummy encoding is used for regression models that are used to make predictions about one value based on another value, when one of those values is a categorical variable. For example, you could predict your exam score, which is a continuous variable, based on your favorite Kinnu tile, which is a categorical variable. This is because without dummy encoding, the correlation coefficient for the model cannot be calculated.\n\nDue to the fact that a regression analysis requires a numerical value as the input – we need to transform our categorical variable to an integer. Dummy encoding enables us to do that. For example, let’s say our data looks like this:\n\n ![Graph](image://ae504abe-393a-4ed4-94c6-ded3ddc928e9 \" \")\n\nOnce dummy encoding has been performed on the data above, our data will now be represented numerically like so:\n\n\n ![Graph](image://82c8d8db-3198-4200-b8c1-aae7b6f20cb6 \" \")\n\nBut, you might notice that one of our options is missing. Where did ‘Private Jet’ go? No, it didn’t take off to the Maldives! It went missing because if all fields equal 0 for the other three columns, we know that the value for ‘Private Jet’ must be ‘true’ – 1. This is the case with Elon in our data above. 
It is this step that is crucial for enabling the calculation of correlation coefficients in regression models. \n\n","25d2d67b-f3d5-457b-8385-73e8794bf8f7",[926],{"id":927,"data":928,"type":55,"version":25,"maxContentLevel":28},"5b29a227-75dd-4c30-ba81-54c62aeb2c4e",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":929,"multiChoiceCorrect":931,"multiChoiceIncorrect":933},[930],"What is the purpose of dummy encoding in a regression model?",[932],"To transform a categorical variable to an integer",[934,935,936],"To calculate correlation coefficients","To enable the calculation of correlation coefficients","To represent data numerically",{"id":938,"data":939,"type":30,"maxContentLevel":28,"version":25,"orbs":942},"2d5d00ca-2884-40ca-8ab4-9f1b0514c002",{"type":30,"title":940,"tagline":941},"Probability ","Learn how to model and predict the world around you – from card games to medicine.",[943,999,1034,1061],{"id":944,"data":945,"type":42,"version":25,"maxContentLevel":28,"pages":947},"f9513546-54b5-4d50-a4f1-13492108bdd9",{"type":42,"title":946},"Foundations of Probability",[948,966,984],{"id":949,"data":950,"type":25,"maxContentLevel":28,"version":25,"reviews":954},"c3239cdb-5339-4c79-b4ff-07695e2c570f",{"type":25,"title":951,"contentRole":42,"markdownContent":952,"audioMediaId":953},"What is Probability ?","\nHave you ever wanted to predict an event, like who will win the election, or how likely you can flip a coin and get heads three times in a row? \n\nProbability helps you answer such questions. While many events can’t be predicted with complete certainty, when done correctly, probability methods enable us to predict the likelihood of events with enough reliability to inform our decision making. \n\nProbability values always range from 0 to 1, where 0 is ‘never going to happen’ and 1 is ‘you bet, it will definitely happen’. In the middle, we have 0.5, which is an even chance of the event happening, or not happening. 
\n\nWhen it comes to predicting events, probability is just a guide. For example, strictly speaking, if you flip a coin 10 times you should expect to get heads on 5 of those flips. Go ahead and try it though, because in reality it doesn’t always work out this way. You might get 7 heads, or even 10. \n\nProbability is used to make predictions in fields like weather forecasting, medicine, insurance, and more.\n\n\n ![Graph](image://77667267-8a8c-4f45-9cea-d55e616a7dbd \"x\")","3db214dd-a8e6-4a21-9dc3-3d6fc279b57c",[955],{"id":956,"data":957,"type":55,"version":25,"maxContentLevel":28},"3a111f19-4080-4eaa-bfce-9a6ea69316c3",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":958,"multiChoiceCorrect":960,"multiChoiceIncorrect":962},[959],"What is the range of values that probability values can take?",[961],"0 to 1",[963,964,965],"0 to 10","0 to 100","0 to 0.5",{"id":967,"data":968,"type":25,"maxContentLevel":28,"version":25,"reviews":972},"339fdabe-8bb2-4877-9acc-ec2e95f08fd9",{"type":25,"title":969,"contentRole":42,"markdownContent":970,"audioMediaId":971},"Introduction to frequentist probability ","\nHave you ever wondered how likely it is that something will happen? Maybe you wondered how likely it was you could flip a coin and get heads 5 times in a row? It’s possible you were even trying to win a bet. Probability can help you with that.  \n\nThe two main schools of probability thought are frequentist probability, and Bayesian probability. Here we will focus on the frequentist way of thinking.\n\nFrequentist methods use large samples and statistical methods to generate probability distributions, and from those, make predictions about the probability of an event. For example, to find the probability of getting heads when flipping a coin, a frequentist would flip that coin 100000 times and count how many times it landed on heads. 
\n\nWhen you want to find a general trend in data that you’ve already collected, frequentist statistics typically produce more reliable results than bayesian statistics. Also, when you need to test multiple hypotheses at once – like determining whether there is enough evidence to support two competing hypotheses, then frequentist statistics is typically preferable.\n\n\n ![Graph](image://bed48557-49e1-4ac9-89d1-2348ce32ad9a \"x\")","193ee3d6-8616-4524-88ce-0de808c61f28",[973],{"id":974,"data":975,"type":55,"version":25,"maxContentLevel":28},"78d3e63e-8bf2-4fc7-8687-279d1f3a3093",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":976,"multiChoiceCorrect":978,"multiChoiceIncorrect":980},[977],"What type of statistics is typically used when wanting to find a general trend in data that has already been collected?",[979],"Frequentist statistics",[981,982,983],"Bayesian statistics","Inferential statistics","Descriptive statistics",{"id":985,"data":986,"type":25,"maxContentLevel":28,"version":25,"reviews":990},"061dec8f-785a-4634-bfe6-ba51b31fe870",{"type":25,"title":987,"contentRole":42,"markdownContent":988,"audioMediaId":989},"Introduction to bayesian probability ","\nBayesian probability can be used to estimate the likelihood of an event based on your current knowledge and prior beliefs. As an example, if you know that people with red hair are more likely to be left-handed, then you can use the Bayes' Theorem to estimate the likelihood of left-handedness given the observed data. Bayesian probability provides a framework for understanding how chance affects outcomes in complex systems.\n\nAs an example, when flipping a coin, a bayesian would say that we know that there are two equally likely possible outcomes, so the chance of getting heads is 50% – we don’t need to flip the coin 100000 times. 
\n\nBayesian probability actually goes much deeper than that, though, it involves forming prior assumptions based on known observations or estimates, and updating probability estimates based on new evidence. \n\nFor example, Bayesian probability tells you the chance that you have a disease, given that you tested positive for it. This takes into account evidence, such as the base rate of the disease in the population, as well as the test’s accuracy. Bayesian statistics is useful for things like rare diseases, where finding large samples for frequentist methods is just not possible. \n","bf6a4f0f-0c4d-47cd-a040-b6b9af751ac5",[991],{"id":992,"data":993,"type":55,"version":25,"maxContentLevel":28},"5ed8bc89-3412-4245-8924-9b94258fed57",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":994,"clozeWords":996},[995],"Bayesian probability can be used to estimate the likelihood of an event, given observed data.",[997,998],"likelihood","observed",{"id":1000,"data":1001,"type":42,"version":25,"maxContentLevel":28,"pages":1003},"f5ffd4fd-c451-4b95-a0fe-ce8c53c40b8f",{"type":42,"title":1002},"Probability Operations",[1004,1018],{"id":1005,"data":1006,"type":25,"maxContentLevel":28,"version":25,"reviews":1010},"0809cc2d-6758-4a1a-8a75-dcd3c22c7d4a",{"type":25,"title":1007,"contentRole":42,"markdownContent":1008,"audioMediaId":1009},"Unions, Complements, and Intersections","\n ![Graph](image://a38b07c0-b089-4e3e-b642-29245278e522 \" \")\n\nIntersections are when two events occur together, and are typically denoted by the intersection symbol (with the common way of expressing an interaction’s probability below):\n\n ![Graph](image://1caab321-bda1-4886-9a8c-8177f7fa2ac0 \"The intersection symbol (above), and the expression for an intersection's probability (below)\")\n\nBut sometimes you might see it written as P(A and B) or just P(AB). The intersection is shown in the overlapping of the circles in our venn diagram above. 
As an example, the probability that a card drawn from a deck is both red, and a Queen. \n\nUnions are when either event A, or B, or both occur. Unions are typically denoted by the union symbol (with the common way of expressing a union’s probability below): \n\n\n ![Graph](image://07743de3-9802-438e-94fb-70afe8bc274f \"The union symbol (above), and the expression for a union's probability (below)\")\n\nThe union is visualized as the entire area of the two circles in our venn diagram above, minus the area of the intersection. An example would be drawing a red card or a queen. The equation to find the union looks like this:  \n\n ![Graph](image://d8d8fe5e-88e4-4ce3-a1e1-1cadad0418f1 \" \")\n\n","f71916ea-dd99-4205-8542-b24a7ecbff33",[1011],{"id":1012,"data":1013,"type":55,"version":25,"maxContentLevel":28},"220c050a-0ea9-4dc9-8bd4-f2bf5e8e122c",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":1014,"activeRecallAnswers":1016},[1015],"P(AB) is one way of representing the probability of what kind of relationship?",[1017],"An intersection",{"id":1019,"data":1020,"type":25,"maxContentLevel":28,"version":25,"reviews":1024},"4e40618d-dde2-4c45-8a68-9c0ff222d06c",{"type":25,"title":1021,"contentRole":42,"markdownContent":1022,"audioMediaId":1023},"Complements"," ![Graph](image://09985913-48b8-4b34-8128-e1ef5fd84f00 \" \")\n\nComplements are all of the events outside of A, and are typically signified by A’. Due to the fact that the sample space probability adds up to 1, the probability of A’ is equal to 1 - P(A). \n\nTo take a useful medical example, imagine we have a union between diabetes and hypertension – the probability that the person has neither of these is 1 -  P(A U B). In this way, by knowing the probability of an event, or events, you can easily calculate the probability of an event not occurring, too. 
\n\nAs an example, if A is the probability of you drawing a red card from a deck of cards, then A’ is the probability that you draw anything other than a red card. \n\nGiven there are 26 red cards in a deck of cards, the probability you draw one is 26/52 which equals .5 – so A’ = 1 - .5 = .5\n\n","8b2a2b2d-c800-42aa-b3e1-8e94c404ed0f",[1025],{"id":1026,"data":1027,"type":55,"version":25,"maxContentLevel":28},"63f66e59-18c3-4761-bd89-1ba245454d92",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":1028,"binaryCorrect":1030,"binaryIncorrect":1032},[1029],"What is the probability of drawing a card from a deck of cards that is not red?",[1031],"0.5",[1033],"0.25",{"id":1035,"data":1036,"type":42,"version":25,"maxContentLevel":28,"pages":1038},"23b2f638-48fe-41ee-91ee-5147d2366a1a",{"type":42,"title":1037},"Conditional Probability",[1039,1055],{"id":1040,"data":1041,"type":25,"maxContentLevel":28,"version":25,"reviews":1045},"b4e7e33b-05d4-46d7-8960-ecc9f08acc46",{"type":25,"title":1042,"contentRole":42,"markdownContent":1043,"audioMediaId":1044},"Introduction to Conditional Probability "," ![Graph](image://37402960-471e-4c3a-a618-d4546fb107cd \" \")\n\nConditional probability is perhaps easiest to visualize first – you’ve come across a venn diagram before, right? Well let’s just wrap that in a bigger circle which represents our sample space of all possible outcomes. \n\nWhen it comes to conditional probability, we are narrowing our potential set of outcomes. Given that B has occurred, that narrows our sample space and excludes anything shown in red. 
It also reduces the probability that A can occur, which is why we only have a small piece of overlap in the venn diagram.\n\nIf we want to know the probability of A occurring, given that B has already occurred, the formula to do so looks like this, where the formula in the numerator is the intersection of events A and B, the chance they happen together: \n\n ![Graph](image://116cb5ed-9899-4265-8ae2-5ac7d6a5af50 \" \")\n\nP(B|A) means the probability of event B occurring, given that A has already occurred. \n\n\n ![Graph](image://e9da21a4-26c2-4e88-a470-1fdfaa3e1ef2 \" \")\n\nAs an example, what is the probability that somebody is a Kinnu user given that you know they prefer cats? That would be P(Kinnu user|prefers cats) = 12/36 = 33.3% \n\n","42f93442-ea61-4a55-8f9a-cf838c1989a4",[1046],{"id":1047,"data":1048,"type":55,"version":25,"maxContentLevel":28},"8eeb5f84-f9e5-47da-aa79-b9008aca171f",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":1049,"binaryCorrect":1051,"binaryIncorrect":1053},[1050],"What is the equation for the probability of an event occurring, given that another event has already occurred?",[1052],"P(B|A)",[1054],"P(2AB)",{"id":1056,"data":1057,"type":25,"maxContentLevel":28,"version":25},"457d1844-93e6-49d0-a3e8-f8adf59f856a",{"type":25,"title":1058,"contentRole":42,"markdownContent":1059,"audioMediaId":1060},"Conditional probability calculations","\nConditional probability is the chance that one event will happen, given that another event has already occurred. It is calculated via the formula below. \n\n ![Graph](image://0659704c-ad61-43a8-abab-e28cb2ff4c41 \" \")\n\nLet’s say that 50% of Kinnu users study data science, and 30% study history. 15% study both history and data science. Yesterday you met a random Kinnu user on the street, and you wondered if they do the history pathway like you do. They mentioned that they take data science, but they didn’t mention anything about history. So what are the chances they do? 
\n\nWe know the probability they take history is 0.3 – let’s call that event B. And the probability they are learning data science is 0.5 – that’s event A. The probability they do both, is .15 – which is given as the intersection in our formula above, in the numerator. \n\n\n\n ![Graph](image://70fbcbe6-7a43-4d25-aab4-f3a121301733 \" \")\n\n\nSo, it turns out there is a 30% chance that your new friend takes history as well. Next time you see them you can ask about it, but it's not very likely that they do. \n\n","35f32834-89cb-4419-bf10-c546c4985bce",{"id":1062,"data":1063,"type":42,"version":25,"maxContentLevel":28,"pages":1065},"32323060-7243-42c1-8861-ae09ed4e95f4",{"type":42,"title":1064},"Probability Events",[1066,1080],{"id":1067,"data":1068,"type":25,"maxContentLevel":28,"version":25,"reviews":1072},"1a884b29-b050-4d7d-8a59-4ac8681966a3",{"type":25,"title":1069,"contentRole":42,"markdownContent":1070,"audioMediaId":1071},"Mutually Exclusive Events","\n ![Graph](image://40603feb-0ee0-4092-b133-e6eb9ae3b37b \"A coin flip is a classic disjoint event\")\n\n\nSometimes also called ‘disjoint events’, mutually exclusive events just cannot happen together. For example, imagine that you flip a two sided coin, once. It's impossible for the flip to be both heads and tails on a single coin flip. Let’s take it one step further though, instead of two possible events, you now have three. You have a handful of delicious, tasty, and colorful M&Ms. Yum. \n\nFor simplicity, let’s say we only have three colors left, somebody ate all the rest – not the Kinnu team, I swear! We know that there are 10 M&Ms total – 3 red, 3 green, and 4 blue. \n\nThat means our probabilities of picking each are 0.3, 0.3, and 0.4 – but what if we want to know our probability of picking a red OR a green M&M. When we have mutually exclusive events, it’s easy to calculate the probability of a problem like this. 
All we do is add up the two probabilities like so: \n\n\n ![Graph](image://d44021b9-9dba-4541-b3a5-50bfe326de9e \" \")\n\nThe U symbol that you see means Union – and means that either A or B occurs, but not both. \n\n","e971ad86-79ec-4022-a92b-c2c8958387ac",[1073],{"id":1074,"data":1075,"type":55,"version":25,"maxContentLevel":28},"0a998639-a198-4aab-aa3b-37aab7592fab",{"type":55,"reviewType":25,"spacingBehaviour":25,"activeRecallQuestion":1076,"activeRecallAnswers":1078},[1077],"What is the term for the probability that either A or B occurs, but not both?",[1079],"Union",{"id":1081,"data":1082,"type":25,"maxContentLevel":28,"version":25,"reviews":1086},"08ec1a70-9038-4a29-bfae-df7cd10a8f24",{"type":25,"title":1083,"contentRole":42,"markdownContent":1084,"audioMediaId":1085},"Frequentist Statistics ","\n\n\nA simple example to understand frequentist statistics is to imagine that we didn’t know the probabilities associated with a coin flip. Maybe the coin is weighted heavier on one side which changes the odds? How would we uncover the underlying probability? \n\nAs frequentists rely purely on the ‘what is’ of data, meaning, what has already been observed, to a frequentist, probability is what has been observed in the frequencies of different outcomes from a series of controlled trials. Frequentist statistics uses something called Maximum Likelihood Estimation ‘MLE’. \n\n ![Graph](image://f514a859-c8d8-4a15-bb47-6bc6eec67a87 \" \")\n\nA frequentist coin flip experiment involves hundreds, thousands, or even hundreds of thousands of coin flips – each time the result is recorded, and at the end of the experiment the total number of heads is divided by the total number of coin flips. That is our estimate of the probability of flipping heads, and only after a large number of trials can we estimate it with confidence. \n\nBut, frequentist probability has its limitations, because it requires that events be repeatable. 
However some things just aren’t repeatable, like elections, or a world cup soccer match. Sure, there is an election every year, and a world cup match every 4 years  – but not the same election or the exact same world cup match. For events like these, a Bayesian approach is more reliable. ","f763eb33-50dc-4639-8cf4-e2f087a04cba",[1087],{"id":1088,"data":1089,"type":55,"version":25,"maxContentLevel":28},"deb31257-d9ff-4c73-9563-1c9a94d7c15b",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":1090,"multiChoiceCorrect":1092,"multiChoiceIncorrect":1094},[1091],"What is the name of the method used by frequentist statistics to estimate probability?",[1093],"Maximum Likelihood Estimation (MLE)",[1095,1096,1097],"Maximum Likelihood Analysis (MLA)","Maximum Likelihood Probability (MLP)","Maximum Likelihood Calculation (MLC)",{"id":1099,"data":1100,"type":30,"maxContentLevel":28,"version":25,"orbs":1103},"3b078f23-f5f2-4253-acf7-b1d8122c745a",{"type":30,"title":1101,"tagline":1102},"Bayesian Probability ","Calculate advanced conditional probabilities using Bayes' Theorem",[1104,1171],{"id":1105,"data":1106,"type":42,"version":25,"maxContentLevel":28,"pages":1108},"5f8c49cc-4d5e-41a3-b288-6a5762aa6aac",{"type":42,"title":1107},"Understanding Bayes' Theorem",[1109,1124,1139,1157],{"id":1110,"data":1111,"type":25,"maxContentLevel":28,"version":25,"reviews":1115},"19d539e2-414c-44a3-80e9-c374be21623e",{"type":25,"title":1112,"contentRole":42,"markdownContent":1113,"audioMediaId":1114},"The difference between Bayes’ Theorem and Conditional Probability ","\nConditional probability tells you the probability of one event happening based on whether another event is true or not. Conditional probability is used for simpler problems. Bayes' Theorem is a structured formula for more complicated problems, with the ability to update as new information comes in. 
\n\nBut because Bayes’ Theorem contains a conditional probability in the numerator, it is important to understand how to calculate conditional probability in order to calculate Bayes' Theorem. \n\nThe equation for Conditional Probability is as follows:\n\n\n ![Graph](image://400a92ca-30d6-41b2-949d-2079f209c30f \" \")\n\nHowever, the formula for Bayes’ Theorem includes two conditional probabilities – one in the numerator, and one to be calculated – and looks like this: \n\n\n ![Graph](image://b76d26c0-1aac-47e5-a759-94a09a451c71 \" \")\n\nBayes' Theorem can be used for problems such as knowing the probability that you have a disease given you got a positive test, when you need to take other information into account like the base rate of the disease in the population, and the test’s accuracy – which we use to update our prior probability. \n\n","616d9674-08d0-4659-b97b-dc54f02eaf35",[1116],{"id":1117,"data":1118,"type":55,"version":25,"maxContentLevel":28},"a663aa32-7721-4662-8294-317a3fcd96a7",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":1119,"binaryCorrect":1121,"binaryIncorrect":1123},[1120],"What kind of probability tells you the probability of one event happening based on whether another event is true or not?",[1122],"Conditional probability",[1069],{"id":1125,"data":1126,"type":25,"maxContentLevel":28,"version":25,"reviews":1130},"bdef6577-98b4-4a66-83a0-3168d3494454",{"type":25,"title":1127,"contentRole":42,"markdownContent":1128,"audioMediaId":1129},"About Bayes’ Theorem ","\n\n\nBayes’ Theorem is an extension of Conditional Probability, and at its core allows us to update the predicted probabilities of an event through incorporating new information, allowing for dynamic calculation of probabilities. It is calculated via the following formula, where P(A|B) is the probability of event A given that B has already happened. 
\n\n\n ![Graph](image://ecf71fbd-782e-40fe-9861-0ee5c4bd3e11 \" \")\n\n\nToday, we have found uses for it in spam detection, risk detection, and more. \n\nAlgorithms such as Naive Bayes allow us to classify the emotional sentiment of text social media posts, or even recommend films we might like to watch next. Bayesian Neural Networks allow us to forecast stock markets, or perform facial recognition tasks. \n\nPut simply, Bayes' Theorem takes a result A and relates it to the conditional probability of that result given other related events. When false positives are involved, Bayes' Theorem gives a more accurate assessment of risk. As an example, in medical testing where a positive result does not tell you your chances of having a disease without adjusting for the base rate of the disease in the population as well as the test's accuracy.\n\n\n\n","272db205-5c5b-4a23-903a-740f6d8ef90c",[1131],{"id":1132,"data":1133,"type":55,"version":25,"maxContentLevel":28},"1bc1892c-017d-4247-b154-be0db536d8bc",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":1134,"clozeWords":1136},[1135],"Bayes' Theorem is used to calculate the probability of event A given related events.",[1137,1138],"event A","related",{"id":1140,"data":1141,"type":25,"maxContentLevel":28,"version":25,"reviews":1145},"0a891f8b-a016-4360-be55-39ff03682b20",{"type":25,"title":1142,"contentRole":42,"markdownContent":1143,"audioMediaId":1144},"Bayes' Theorem Priors, Posterior, and Likelihoods ","\n\n\nThe core idea behind Bayesian probability is to start with an initial belief, and become less wrong about our belief by updating it with new information as new information becomes available. Our initial belief is called the prior probability – P(A). For example, the base rate of disease prevalence in a population. 
\n\nWe get the posterior probability – P(A|B), for example the probability you have a disease given that you tested positive for it – by updating our initial belief with new information such as the likelihood – P(B|A), and P(B) – which in this example is the probability of getting a positive test whether you have the disease or not. P(B) is equal to:\n\n ![Graph](image://fea75144-82e1-4e5e-8c05-ca0e85e29be8 \" \")\n\nThe likelihood tells you the likelihood you will get result B, given that we already know that A is true. For example, this is the probability of getting a positive test result if you do in fact have the disease. Which in our example, is the test’s accuracy. ","79391bc7-62cf-4261-8b60-87402ea47b09",[1146],{"id":1147,"data":1148,"type":55,"version":25,"maxContentLevel":28},"d478b4f2-9fab-4d54-be6a-4d783e853ac9",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":1149,"multiChoiceCorrect":1151,"multiChoiceIncorrect":1153},[1150],"What is the initial belief called in Bayesian probability?",[1152],"Prior probability",[1154,1155,1156],"Posterior probability","Likelihood","Base rate",{"id":1158,"data":1159,"type":25,"maxContentLevel":28,"version":25,"reviews":1163},"34e03dea-e606-4b01-8520-716474919e96",{"type":25,"title":1160,"contentRole":42,"markdownContent":1161,"audioMediaId":1162},"Bayes' Theorem example ","\nPut simply, Bayes' Theorem takes a result A and relates it to the conditional probability of that test result given other related events. When false positives are involved, Bayes' Theorem gives a more accurate assessment of risk. \n\nIf a medical test that was 99% accurate returned a positive test for a rare and deadly disease for your pet, that affects 1% of the population, does that mean there is a 99% chance your pet has the disease? Actually, it doesn’t. \n\nLet’s take a sample of 10,000 pets to show why. 
\n\nFirst, we know that P(disease) = .01\n\n ![Graph](image://83f9913f-1192-4d16-9bc7-786c0348b11d \" \")\n\nWe also know that the test is 99% accurate, therefore the 1 percent error rate will mean that out of the 100 diseased pets in our sample of 10,000 – 99 will test positive and one will not, this means that P(tested positive | has disease) = .99\n\nBut the error rate also means that 99 of the 9,900 non-diseased pets will test positive, too. \n\nTherefore P(positive test) = (99+99) / 10,000 = 0.0198 – the formula to calculate for our example is as follows:\n\nP(has disease | tested positive) = [P(positive test | has disease) * P(has disease)] / P(positive test) \n\nThus, P(has disease | tested positive) = (0.99 x 0.01) / 0.0198 = 0.50 = 50%.\n\nThis means that conditional on a positive test, there is not a 99% chance, but a 50% chance your pet has the disease. \n\n","33d2ddb2-e45d-405e-a888-19fd952b94c7",[1164],{"id":1165,"data":1166,"type":55,"version":25,"maxContentLevel":28},"e64ac7e8-3ab0-4d2b-9f13-936e5e5a85a3",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":1167,"clozeWords":1169},[1168],"When false positives are involved, Bayes' Theorem gives a more accurate assessment of risk.",[1170],"false positives",{"id":1172,"data":1173,"type":42,"version":25,"maxContentLevel":28,"pages":1175},"c5361840-13be-4c38-85f7-b93063bf6d97",{"type":42,"title":1174},"Probability Concepts",[1176,1192,1208,1226],{"id":1177,"data":1178,"type":25,"maxContentLevel":28,"version":25,"reviews":1182},"4bf02c6c-b239-4413-8360-ccfeab26d499",{"type":25,"title":1179,"contentRole":42,"markdownContent":1180,"audioMediaId":1181},"Joint Probability","\n\n\nJoint probability is the intersection of 2 or more events. It tells you the chance that all events will occur simultaneously. However, it is important to note that the events must be independent of one another.\n\nThe best example of this is rolling two dice. How likely is it that you will roll two sixes? 
Neither event can influence the other; one die landing with 6 facing up does not influence the other die. \n\nTo calculate joint probability you will need to multiply the probability of event A, by the probability of event B. Of course, to do that you first need to know the probability of both A and B. \n\nFor a rolling two sixes in a game of dice, the probability of rolling a six is equal to: \n\n![Graph](image://9fba4a62-71b1-4746-9c15-6c56f8ba94e9 \" \")\n\n\nThen, to calculate the probability of rolling two sixes, we simply multiply our probabilities together, like so: \n\n ![Graph](image://50c412bf-dd51-4ef8-9ed3-4bebc3be840e \" \")\n\n\nTo get the percentage chance, multiply your answer by 100. \n\nSo the probability of rolling two sixes is only 2.77%! \n\n\n","dc84fb85-2119-457c-a40e-d87473a7dc76",[1183],{"id":1184,"data":1185,"type":55,"version":25,"maxContentLevel":28},"3236e192-816a-4e84-89ab-8ee9da044849",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":1186,"binaryCorrect":1188,"binaryIncorrect":1190},[1187],"What is the probability of rolling two sixes in a game of dice?",[1189],"2.77%",[1191],"6.77%",{"id":1193,"data":1194,"type":25,"maxContentLevel":28,"version":25,"reviews":1198},"237e80a5-dcef-4764-878e-beee7f3930df",{"type":25,"title":1195,"contentRole":42,"markdownContent":1196,"audioMediaId":1197},"Sampling with replacement "," ![Graph](image://54e253d8-fbbf-4781-9bf2-d6089dea8bf8 \" \")\n\nSampling with replacement is an important concept to understand when it comes to probability. This is because the concept can be used to help improve the quality of our point estimates like sample means via bootstrapping.\n\nLet’s deal with an M&M’s factory – we want to know how many of each color the factory produces. Instead of counting them all, we take one out of a big bucket, record the color of it, and then place it back. We do this thousands of times, until we have the frequencies for each color. 
\n\nThe reason we place it back is so that the next trial is not influenced by the previous trial. This is because by completely removing a green M&M and eating it, we forever change the probability of drawing a green M&M from the bucket – there is one less now so it is less likely. \n\nIn fact, with sampling with replacement, you could take out the same green M&M hundreds of times. \n\nSampling with replacement is useful when you need to get an idea of the frequencies within a population, but couldn’t possibly count every single M&M at the factory!\n\n","75ad38b4-55bd-4db2-abb3-199e82280e17",[1199],{"id":1200,"data":1201,"type":55,"version":25,"maxContentLevel":28},"460df884-51ee-4c6c-a68c-49f0a47524f5",{"type":55,"reviewType":42,"spacingBehaviour":25,"binaryQuestion":1202,"binaryCorrect":1204,"binaryIncorrect":1206},[1203],"What is the purpose of sampling with replacement?",[1205],"To get an idea of the frequencies within a population without counting every single item",[1207],"To permanently remove items from a population",{"id":1209,"data":1210,"type":25,"maxContentLevel":28,"version":25,"reviews":1214},"84a41184-fe4a-4bbe-a0a3-99ba53f19e54",{"type":25,"title":1211,"contentRole":42,"markdownContent":1212,"audioMediaId":1213},"Bootstrapping ","Bootstrapping, through sampling with replacement, helps us measure uncertainty surrounding our point estimate, for example the sample mean, which is an estimate of the true population average.\n\nWhen bootstrapping, our sample gets treated as if it were the population. And we take a sub-sample from that, and calculate the statistics of interest on that sub-sample. We then get a probability distribution of the sample means from each sample. \n\nThe key thing here is that your subsample is the exact same size as your original sample. If you’re wondering how that is possible without using the exact same sample each time, it is possible because each individual observation is sampled with replacement. 
This means that person 5, your observation, can be in your sub-sample any number of times, it could be 0, or it could be 10 times. Like in bootstrap sample 3 below, observation 5 was sampled twice.\n\n\n ![Graph](image://5031c2cf-9f14-45d0-9fb9-3fe0b0a3a009 \" \")\n\nThrough bootstrapping, we can save a lot of time and money by not needing to gather entirely new samples, yet we can still get a reliable estimate of population parameters. \n\n","757bc3fd-ceb4-4706-abfa-20322d6ce78c",[1215],{"id":1216,"data":1217,"type":55,"version":25,"maxContentLevel":28},"b8818947-ebe1-451d-b79e-3673a5c17d0b",{"type":55,"reviewType":28,"spacingBehaviour":25,"multiChoiceQuestion":1218,"multiChoiceCorrect":1220,"multiChoiceIncorrect":1222},[1219],"What is the key feature of bootstrapping that allows us to save time and money?",[1221],"Sampling with replacement",[1223,1224,1225],"Sampling without replacement","Sub-sampling","Calculating statistics",{"id":1227,"data":1228,"type":25,"maxContentLevel":28,"version":25,"reviews":1232},"d3ba364f-12c8-405b-8efd-78ad8500695e",{"type":25,"title":1229,"contentRole":42,"markdownContent":1230,"audioMediaId":1231},"Unconditional Probability ","\nUnconditional probability is otherwise known as ‘marginal probability’, and means that the next event is completely independent of the previous event. \n\nBase rates, like the percentage of days in a year that it rains, are one form of unconditional probability. This is because they are not conditioned on any other event, like the presence of clouds. \n\nAnother simple example of unconditional probability is the coin toss. Assuming no outside forces are acting on the coin, no matter how many times we toss a coin, we know what the probability of getting heads, or rolling a 2 will be. This is unlike drawing from a deck of cards without replacing the card you just drew, which changes the future probabilities irrevocably. \n\nHow do you calculate unconditional probability? Easy! 
\n\n\n ![Graph](image://7de30195-1696-4f2b-b364-3747e91a1942 \" \")\n\nFor a slightly more complicated example compared to our coin flip… there are 4 aces in a deck of 52 cards, so to pull an ace out of a fresh deck of cards, your calculation will look as follows:\n\n ![Graph](image://61629131-b9c7-4c07-b039-1eaf22adcffe \" \")\n\n","cd6330ea-7ba4-4182-85ac-eafd74fcc73b",[1233],{"id":1234,"data":1235,"type":55,"version":25,"maxContentLevel":28},"b7d68e08-5027-423c-adf5-a19151e3bb0b",{"type":55,"reviewType":21,"spacingBehaviour":25,"clozeQuestion":1236,"clozeWords":1238},[1237],"Unconditional probability is calculated by dividing the number of times 'A' can occur by the total number of possible outcomes .",[1239],"Unconditional probability",{"left":4,"top":4,"width":1241,"height":1241,"rotate":4,"vFlip":6,"hFlip":6,"body":1242},24,"\u003Cpath fill=\"none\" stroke=\"currentColor\" stroke-linecap=\"round\" stroke-linejoin=\"round\" stroke-width=\"2\" d=\"m9 18l6-6l-6-6\"/>",{"left":4,"top":4,"width":1241,"height":1241,"rotate":4,"vFlip":6,"hFlip":6,"body":1244},"\u003Cg fill=\"none\" stroke=\"currentColor\" stroke-linecap=\"round\" stroke-linejoin=\"round\" stroke-width=\"2\">\u003Cpath d=\"M12.586 2.586A2 2 0 0 0 11.172 2H4a2 2 0 0 0-2 2v7.172a2 2 0 0 0 .586 1.414l8.704 8.704a2.426 2.426 0 0 0 3.42 0l6.58-6.58a2.426 2.426 0 0 0 0-3.42z\"/>\u003Ccircle cx=\"7.5\" cy=\"7.5\" r=\".5\" fill=\"currentColor\"/>\u003C/g>",1778179494083]