[{"data":1,"prerenderedAt":1103},["ShallowReactive",2],{"i-kinnu:logo":3,"i-kinnu:origami-folding":8,"pathway-science-statistics-for-data-science-intermediate-level":12,"i-lucide:chevron-right":1098,"i-lucide:tag":1101},{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":7},0,27,false,"\u003Cg fill=\"none\">\u003Cpath d=\"M0.046875 1.05555C0.046875 1.03541 0.048197 1.01579 0.0507438 0.996728C0.0987149 0.438619 0.586845 0 1.18194 0H25.4398C26.451 0 26.9575 1.171 26.2424 1.85585L15.7301 11.9243L1.31574 0.903476C1.17475 0.79568 1.01137 0.761884 0.859586 0.784111L26.2936 25.1441C27.0086 25.829 26.5022 27 25.4909 27H1.18194C0.555061 27 0.046875 26.5133 0.046875 25.9129V1.05555Z\" fill=\"currentColor\"/>\u003C/g>",{"left":4,"top":4,"width":9,"height":10,"rotate":4,"vFlip":6,"hFlip":6,"body":11},1000,236,"\u003Cg fill=\"none\">\u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M193.68 38.2238C195.994 38.2238 197.87 40.0989 197.87 42.412V231.812C197.87 234.125 195.994 236 193.68 236H4.19013C1.87603 236 2.02305e-07 234.125 0 231.812V42.412C-2.02305e-07 40.0989 1.87603 38.2238 4.19013 38.2238H193.68ZM111.76 89.0072C111.685 87.9474 110.572 87.2905 109.608 87.7376L96.8872 93.641C95.7786 94.1554 95.702 95.7016 96.7545 96.3225L101.579 99.167C94.7045 109.365 90.5733 122.892 90.5732 137.642C90.5733 154.323 95.8569 169.439 104.416 179.945C105.301 181.032 106.9 181.196 107.987 180.311C109.075 179.426 109.238 177.828 108.353 176.741C100.621 167.25 95.6522 153.305 95.6521 137.642C95.6522 123.661 99.6138 111.051 105.963 101.754L110.456 104.403C111.508 105.024 112.826 104.21 112.74 102.991L111.76 89.0072ZM9.63194 136.286C9.14864 136.286 8.75684 136.678 8.75684 137.161C8.7569 137.644 9.14868 138.035 9.63194 138.035H17.2161C17.6993 138.035 18.0912 137.644 18.0912 137.161C18.0912 136.678 17.6994 136.286 17.2161 136.286H9.63194ZM22.6813 136.286C22.198 136.286 21.8062 136.678 21.8062 137.161C21.8063 137.644 22.1981 138.035 22.6813 138.035H30.2655C30.7487 
138.035 31.1406 137.644 31.1406 137.161C31.1406 136.678 30.7488 136.286 30.2655 136.286H22.6813ZM35.7464 136.286C35.2631 136.286 34.8713 136.678 34.8713 137.161C34.8713 137.644 35.2631 138.035 35.7464 138.035H44.4973C44.9805 138.035 45.3724 137.644 45.3724 137.161C45.3724 136.678 44.9806 136.286 44.4973 136.286H35.7464ZM49.9977 136.286C49.5144 136.286 49.1226 136.678 49.1226 137.161C49.1226 137.644 49.5144 138.035 49.9977 138.035H57.5819C58.0651 138.035 58.4569 137.644 58.457 137.161C58.457 136.678 58.0651 136.286 57.5819 136.286H49.9977ZM63.0783 136.286C62.595 136.286 62.2032 136.678 62.2032 137.161C62.2033 137.644 62.5951 138.035 63.0783 138.035H70.6625C71.1457 138.035 71.5375 137.644 71.5376 137.161C71.5376 136.678 71.1457 136.286 70.6625 136.286H63.0783ZM76.1277 136.286C75.6444 136.286 75.2526 136.678 75.2526 137.161C75.2527 137.644 75.6445 138.035 76.1277 138.035H83.7119C84.1951 138.035 84.5869 137.644 84.587 137.161C84.587 136.678 84.1951 136.286 83.7119 136.286H76.1277ZM102.266 136.286C101.782 136.286 101.39 136.678 101.39 137.161C101.391 137.644 101.782 138.035 102.266 138.035H109.85C110.333 138.035 110.725 137.644 110.725 137.161C110.725 136.678 110.333 136.286 109.85 136.286H102.266ZM115.338 136.286C114.855 136.286 114.463 136.678 114.463 137.161C114.463 137.644 114.855 138.035 115.338 138.035H122.923C123.406 138.035 123.798 137.644 123.798 137.161C123.798 136.678 123.406 136.286 122.923 136.286H115.338ZM128.403 136.286C127.92 136.286 127.528 136.678 127.528 137.161C127.528 137.644 127.92 138.035 128.403 138.035H135.988C136.471 138.035 136.863 137.644 136.863 137.161C136.863 136.678 136.471 136.286 135.988 136.286H128.403ZM141.468 136.286C140.985 136.286 140.593 136.678 140.593 137.161C140.593 137.644 140.985 138.035 141.468 138.035H149.053C149.536 138.035 149.928 137.644 149.928 137.161C149.928 136.678 149.536 136.286 149.053 136.286H141.468ZM154.541 136.286C154.058 136.286 153.666 136.678 153.666 137.161C153.666 137.644 154.058 138.035 154.541 
138.035H162.125C162.609 138.035 163 137.644 163.001 137.161C163.001 136.678 162.609 136.286 162.125 136.286H154.541ZM167.614 136.286C167.131 136.286 166.739 136.678 166.739 137.161C166.739 137.644 167.131 138.035 167.614 138.035H175.198C175.681 138.035 176.073 137.644 176.073 137.161C176.073 136.678 175.681 136.286 175.198 136.286H167.614ZM180.671 136.286C180.188 136.286 179.796 136.678 179.796 137.161C179.796 137.644 180.188 138.035 180.671 138.035H188.255C188.739 138.035 189.13 137.644 189.131 137.161C189.131 136.678 188.739 136.286 188.255 136.286H180.671Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M444.85 38.2277C447.164 38.2277 449.04 40.1028 449.04 42.4159V132.928C449.04 135.241 447.164 137.116 444.85 137.116H255.36C253.046 137.116 251.17 135.241 251.17 132.928V42.4159C251.17 40.1028 253.046 38.2277 255.36 38.2277H444.85ZM361.96 125.388C361.618 125.046 361.064 125.046 360.722 125.388L354.534 131.572C354.192 131.914 354.192 132.468 354.534 132.81C354.876 133.151 355.43 133.151 355.772 132.81L361.96 126.624C362.301 126.283 362.301 125.73 361.96 125.388ZM371.047 116.311C370.705 115.969 370.15 115.969 369.809 116.311L364.446 121.671C364.104 122.012 364.104 122.567 364.446 122.908C364.788 123.249 365.342 123.25 365.684 122.908L371.047 117.548C371.388 117.207 371.388 116.652 371.047 116.311ZM380.124 107.246C379.782 106.904 379.227 106.904 378.885 107.246L373.523 112.606C373.181 112.948 373.181 113.502 373.523 113.844C373.864 114.185 374.419 114.185 374.761 113.844L380.124 108.483C380.465 108.142 380.465 107.587 380.124 107.246ZM385.736 65.8841C385.891 64.6727 384.622 63.7845 383.536 64.3434L371.069 70.7636C370.124 71.2504 369.96 72.5334 370.752 73.2424L381.2 82.5938C382.11 83.4081 383.561 82.8672 383.717 81.6557L384.393 76.3725C391.143 77.1933 398.567 80.7709 404.771 86.9711C411.124 93.3213 414.726 100.952 415.43 107.827C415.573 109.221 416.819 110.236 418.214 110.093C419.609 109.95 420.624 108.703 420.481 
107.309C419.644 99.1317 415.435 90.4514 408.362 83.3817C401.466 76.489 393.038 72.3185 385.038 71.338L385.736 65.8841ZM389.2 98.1733C388.859 97.8319 388.304 97.8318 387.962 98.1733L382.6 103.534C382.258 103.875 382.258 104.429 382.6 104.771C382.941 105.112 383.496 105.112 383.838 104.771L389.2 99.4108C389.542 99.0693 389.542 98.5149 389.2 98.1733ZM398.262 89.1047C397.92 88.7633 397.365 88.7632 397.024 89.1047L391.661 94.4649C391.319 94.8065 391.319 95.3608 391.661 95.7024C392.002 96.0436 392.557 96.0438 392.899 95.7024L398.262 90.3421C398.603 90.0007 398.603 89.4463 398.262 89.1047ZM416.431 70.9616C416.089 70.6202 415.534 70.6201 415.193 70.9616L409.83 76.3218C409.488 76.6634 409.488 77.2177 409.83 77.5592C410.172 77.9005 410.726 77.9007 411.068 77.5592L416.431 72.199C416.772 71.8575 416.772 71.3032 416.431 70.9616ZM425.508 61.891C425.166 61.5496 424.611 61.5495 424.27 61.891L418.907 67.2512C418.565 67.5928 418.565 68.1471 418.907 68.4887C419.249 68.8299 419.803 68.8301 420.145 68.4887L425.508 63.1284C425.849 62.787 425.849 62.2326 425.508 61.891ZM434.569 52.8146C434.227 52.4731 433.673 52.4731 433.331 52.8146L427.968 58.1748C427.626 58.5163 427.627 59.0706 427.968 59.4122C428.31 59.7534 428.864 59.7537 429.206 59.4122L434.569 54.052C434.91 53.7105 434.91 53.1562 434.569 52.8146ZM443.638 43.7479C443.296 43.4065 442.742 43.4064 442.4 43.7479L437.037 49.1081C436.695 49.4496 436.696 50.004 437.037 50.3455C437.379 50.6868 437.933 50.687 438.275 50.3455L443.638 44.9853C443.98 44.6438 443.979 44.0895 443.638 43.7479Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M684.066 38.2277C687.798 38.2281 689.667 42.7391 687.027 45.3773L596.473 135.889C595.687 136.675 594.621 137.116 593.51 137.116H506.335C504.021 137.116 502.145 135.241 502.145 132.928V42.4159C502.145 40.1028 504.021 38.2277 506.335 38.2277H684.066ZM514.603 124.566C514.261 124.224 513.707 124.224 513.365 124.566L507.178 130.751C506.836 131.093 506.836 131.646 
507.178 131.988C507.519 132.329 508.073 132.329 508.415 131.988L514.603 125.803C514.945 125.462 514.945 124.908 514.603 124.566ZM523.689 115.491C523.348 115.15 522.794 115.15 522.452 115.491L517.09 120.852C516.748 121.193 516.748 121.747 517.09 122.088C517.431 122.43 517.985 122.43 518.327 122.088L523.689 116.728C524.031 116.386 524.031 115.833 523.689 115.491ZM532.102 65.8295C530.707 65.6872 529.46 66.7017 529.318 68.0957C529.175 69.4896 530.189 70.7355 531.584 70.8787C538.463 71.5825 546.096 75.1826 552.45 81.5329C558.723 87.8037 562.312 95.3226 563.079 102.13L557.738 102.392C556.518 102.452 555.865 103.855 556.607 104.827L565.115 115.969C565.76 116.814 567.051 116.751 567.611 115.847L574.992 103.928C575.635 102.889 574.848 101.555 573.628 101.615L568.161 101.882C568.161 101.878 568.162 101.874 568.161 101.871C567.324 93.6931 563.114 85.0124 556.041 77.9425C548.968 70.873 540.283 66.6668 532.102 65.8295ZM532.766 106.421C532.425 106.079 531.871 106.079 531.529 106.421L526.166 111.781C525.825 112.123 525.825 112.676 526.166 113.018C526.508 113.359 527.062 113.359 527.403 113.018L532.766 107.657C533.108 107.316 533.108 106.762 532.766 106.421ZM541.843 97.3445C541.501 97.003 540.948 97.003 540.606 97.3445L535.243 102.705C534.901 103.046 534.902 103.6 535.243 103.941C535.585 104.283 536.139 104.283 536.48 103.941L541.843 98.5809C542.185 98.2393 542.185 97.686 541.843 97.3445ZM550.92 88.2778C550.578 87.9363 550.025 87.9363 549.683 88.2778L544.32 93.638C543.978 93.9796 543.978 94.5329 544.32 94.8745C544.662 95.2161 545.215 95.2161 545.557 94.8745L550.92 89.5142C551.262 89.1727 551.262 88.6193 550.92 88.2778ZM569.066 70.1405C568.724 69.799 568.17 69.7991 567.829 70.1405L562.466 75.5008C562.124 75.8423 562.124 76.3956 562.466 76.7372C562.808 77.0788 563.361 77.0788 563.703 76.7372L569.066 71.377C569.407 71.0354 569.407 70.4821 569.066 70.1405ZM578.143 61.0699C577.801 60.7284 577.247 60.7285 576.906 61.0699L571.543 66.4302C571.201 66.7717 571.201 67.3251 571.543 
67.6666C571.885 68.0082 572.438 68.0082 572.78 67.6666L578.143 62.3064C578.484 61.9648 578.484 61.4115 578.143 61.0699ZM587.219 51.9896C586.878 51.6481 586.324 51.6481 585.982 51.9896L580.62 57.3498C580.278 57.6914 580.278 58.2447 580.62 58.5863C580.961 58.9279 581.515 58.9279 581.857 58.5863L587.219 53.2261C587.561 52.8845 587.561 52.3312 587.219 51.9896ZM596.288 42.9249C595.947 42.5833 595.392 42.5833 595.05 42.9249L589.689 48.2851C589.347 48.6267 589.347 49.18 589.689 49.5216C590.03 49.863 590.584 49.8631 590.926 49.5216L596.288 44.1613C596.63 43.8198 596.63 43.2664 596.288 42.9249Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M850.814 38.2277C854.547 38.2281 856.416 42.739 853.777 45.3773L763.223 135.889C762.437 136.674 761.371 137.116 760.26 137.116H673.176C669.443 137.116 667.574 132.605 670.213 129.966L760.768 39.4544C761.554 38.6692 762.62 38.2277 763.731 38.2277H850.814ZM761.338 121.8C760.855 121.8 760.463 122.191 760.463 122.674V131.13H762.213V122.674C762.213 122.191 761.821 121.8 761.338 121.8ZM761.338 108.971C760.855 108.971 760.463 109.363 760.463 109.846V118.301H762.213V109.846C762.213 109.363 761.821 108.971 761.338 108.971ZM761.338 96.1402C760.855 96.1406 760.463 96.5321 760.463 97.0149V105.47H762.213V97.0149C762.213 96.532 761.821 96.1404 761.338 96.1402ZM782.263 71.887C781.043 71.951 780.395 73.3571 781.139 74.3257L784.474 78.6631C779.115 82.951 771.242 85.7443 762.35 85.7444C753.366 85.7442 745.421 82.8944 740.059 78.5305C738.972 77.6461 737.373 77.8099 736.488 78.8961C735.602 79.983 735.766 81.582 736.853 82.467C743.231 87.6574 752.348 90.8207 762.35 90.8209C772.209 90.8208 781.205 87.746 787.568 82.6884L790.833 86.9341C791.577 87.9025 793.103 87.6391 793.479 86.4767L797.791 73.138C798.118 72.127 797.33 71.1017 796.268 71.1566L782.263 71.887ZM761.338 70.4847C760.855 70.4851 760.463 70.8767 760.463 71.3594V79.8147H762.213V71.3594C762.213 70.8766 761.821 70.485 761.338 70.4847ZM761.338 
57.656C760.855 57.6564 760.463 58.048 760.463 58.5307V66.986H762.213V58.5307C762.213 58.0479 761.821 57.6563 761.338 57.656ZM761.338 44.8293C760.855 44.8297 760.463 45.2212 760.463 45.704V54.1592H762.213V45.704C762.213 45.2211 761.821 44.8295 761.338 44.8293Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M995.759 38.2277C999.53 38.228 1001.42 42.5171 998.752 45.0253L959.55 81.9005L905.796 41.5363C905.271 41.1418 904.662 41.0182 904.096 41.0994L997.485 130.319C1000.15 132.828 998.262 137.116 994.491 137.116H905.298C902.96 137.116 901.065 135.333 901.065 133.134V42.0941C901.065 42.0204 901.07 41.9483 901.079 41.8786C901.258 39.8345 903.079 38.2277 905.298 38.2277H995.759Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M505.873 0C506.657 4.57042e-05 507.307 0.195499 507.823 0.587023C508.338 0.969046 508.596 1.53802 508.596 2.29251C508.596 2.76034 508.467 3.19015 508.209 3.58162C507.951 3.96344 507.497 4.26401 506.848 4.48361V4.54114C507.65 4.67487 508.205 4.96191 508.51 5.4012C508.816 5.83087 508.969 6.31772 508.969 6.86193C508.969 7.74056 508.672 8.41851 508.08 8.89604C507.497 9.38304 506.733 9.62731 505.787 9.62738C504.861 9.62738 504.158 9.42172 503.68 9.0111C503.212 8.60054 502.935 8.08005 502.849 7.44993L503.881 7.10571L503.924 7.24028C504.035 7.54934 504.211 7.82925 504.454 8.07986C504.731 8.36635 505.166 8.50986 505.758 8.50989C506.465 8.50989 506.943 8.32772 507.191 7.9648C507.449 7.6019 507.579 7.20078 507.579 6.7615C507.579 6.2173 507.378 5.80683 506.977 5.52992C506.585 5.25295 505.93 5.10026 505.013 5.07161V4.15402C505.901 4.12537 506.489 3.92484 506.776 3.55237C507.062 3.18009 507.206 2.82242 507.206 2.47876C507.206 1.62801 506.752 1.17539 505.845 1.12237L505.658 1.11749C505.467 1.11752 505.242 1.14605 504.985 1.2033C504.736 1.25105 504.511 1.3274 504.31 1.43245L504.081 2.56457L503.05 2.44951L503.322 0.687461C503.666 0.49653 504.068 0.33454 504.526 0.200875C504.985 0.0671945 505.434 0 505.873 0Z\"\n    fill=\"currentColor\" />\n  
\u003Cpath\n    d=\"M905.727 2.30616L904.638 2.4066L904.466 1.26083H901.428V3.72497C901.533 3.71544 901.643 3.71034 901.757 3.71034H902.086C902.755 3.71034 903.386 3.78668 903.979 3.93949C904.58 4.09229 905.068 4.38363 905.44 4.8132C905.822 5.23335 906.014 5.84949 906.014 6.66106C906.014 7.64468 905.722 8.38068 905.14 8.86776C904.557 9.36434 903.783 9.6127 902.818 9.61275C901.91 9.61275 901.213 9.40711 900.725 8.99648C900.248 8.59544 899.96 8.08007 899.865 7.44993L900.911 7.10571C901.007 7.49723 901.203 7.8271 901.499 8.09449C901.795 8.37131 902.211 8.50985 902.746 8.50989C903.395 8.50989 903.869 8.33787 904.165 7.99405C904.461 7.65981 904.609 7.22507 904.609 6.69031C904.609 5.87861 904.337 5.3625 903.792 5.14279C903.248 4.91361 902.612 4.79958 901.886 4.79955C901.695 4.79955 901.489 4.80365 901.27 4.8132C901.059 4.82275 900.854 4.83701 900.653 4.85611L900.224 4.44071V0.143343H905.569L905.727 2.30616Z\"\n    fill=\"currentColor\" />\n  \u003Cpath fill-rule=\"evenodd\" clip-rule=\"evenodd\"\n    d=\"M765.49 6.04576H766.966L766.837 7.14862H765.49V9.48404H764.185V7.14862H759.857L759.713 6.04576L762.909 0.143343H765.49V6.04576ZM760.96 6.04576H764.185V1.26083H763.541L760.96 6.04576Z\"\n    fill=\"currentColor\" />\n  \u003Cpath d=\"M4.80573 6.47481H6.41154V7.60693H1.81068V6.47481H3.50235V1.27546H1.81068V0.143343H4.80573V6.47481Z\"\n    fill=\"currentColor\" />\n  \u003Cpath\n    d=\"M254.359 0C255.353 0 256.055 0.239186 256.466 0.716715C256.877 1.18447 257.083 1.68072 257.083 2.20573C257.083 2.85516 256.849 3.44346 256.38 3.96875C255.912 4.49397 255.348 4.96638 254.689 5.38657C254.039 5.79717 253.437 6.15968 252.883 6.47481H256.423L256.538 5.42948L257.599 5.51529L257.426 7.60693H251.407L251.292 6.58987C252.582 5.73032 253.638 4.98523 254.46 4.35489C255.281 3.71509 255.693 3.05632 255.693 2.37832C255.693 1.53787 255.166 1.11749 254.115 1.12237L254.115 1.11749C253.924 1.11754 253.695 1.14604 253.427 1.2033C253.16 1.25104 252.916 1.32238 252.697 1.41783L252.467 
2.47876L251.45 2.3637L251.707 0.60165C252.118 0.401088 252.563 0.253475 253.041 0.15797C253.519 0.0529708 253.958 1.99446e-05 254.359 0Z\"\n    fill=\"currentColor\" />\u003C/g>",{"id":13,"data":14,"type":15,"maxContentLevel":29,"version":30,"tiles":31},"621bc00e-e0ea-4deb-a8e3-edd0f30d0c0e",{"type":15,"title":16,"tagline":17,"description":17,"featureImageSquare":18,"baseColor":19,"emoji":20,"shapePreference":4,"allowContentSuspension":21,"allowContentEdits":21,"editorsChoice":6,"accreditations":22,"certificatePriceLevel":27,"certificationTitle":28},8,"Statistics for Data Science: Intermediate Level","An intermediate guide to statistical analysis for data science","3042473e-1301-46c1-98f5-05b93789c052","#9A8F74","📊",true,[23],{"authority":24,"wasCpdTill":25,"previousCpdCreditMinutes":26},1,"2025-12-31T00:00:00Z",120,3,"Statistics for Data Science (Level 2)",9,6,[32,272,420,617,857],{"id":33,"data":34,"type":29,"maxContentLevel":27,"version":37,"orbs":38},"f40ea19a-1992-440e-8211-34dcbe9806c8",{"type":29,"title":35,"tagline":36},"Sampling Methods","How to find a sample from your population.",2,[39,92,141,209],{"id":40,"data":41,"type":37,"version":37,"maxContentLevel":27,"pages":43},"3757eb05-49db-452f-bac6-7af31f425abc",{"type":37,"title":42},"Types of Sampling Methods",[44,60,74],{"id":45,"data":46,"type":24,"maxContentLevel":27,"version":37,"reviews":50},"22a60c69-5e39-4949-8cd3-8d0b6ba858d3",{"type":24,"title":47,"contentRole":37,"markdownContent":48,"audioMediaId":49},"Probability sampling ","Probability sampling is sometimes also called ‘random selection’ or ‘random sampling’. ‘Probability sampling’ is just a fancy way of saying that everyone has an equal chance of being selected.\n\nWhat does that mean for you?\n\nWell, random sampling is preferable because it will generalize better. 
That means you can be more confident that the inferences you draw from your sample will also hold true in your population.\n\nWith random sampling, you can be more confident in the map you build of your dataset.","c4d7fc40-3438-4389-a441-c830afcd4959",[51],{"id":52,"data":53,"type":54,"version":24,"maxContentLevel":27},"d875b6a7-2b9f-4c5b-b6c0-d4d5b1e15526",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":56,"clozeWords":58},11,4,[57],"Random sampling is also known as probability sampling and it helps you build a map you can be more confident in.",[59],"probability",{"id":61,"data":62,"type":24,"maxContentLevel":27,"version":37,"reviews":66},"f89c67aa-23cb-45ef-ad40-2512186590b0",{"type":24,"title":63,"contentRole":37,"markdownContent":64,"audioMediaId":65},"Non-probability sampling","Non-probability sampling is also known as ‘non-random sampling’, which means that some people have a better chance of being selected than others. The selection process isn’t completely random.\n\nResults from samples taken using non-random sampling don’t generalize as well to the population as results from samples taken using random sampling. So you might get led astray by your sample data.\n\nYou could be directionally right but just out by a little distance – it’s like if you took the right street to get to the local pool and you just ran out of gas before you got there. 
Or you could be directionally wrong, in which case you ended up in a different town altogether.","fd83f3d4-199f-435a-aa5d-d75fa74ae0f6",[67],{"id":68,"data":69,"type":54,"version":24,"maxContentLevel":27},"736403b0-2042-41db-a37d-74a51da7e3e3",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":70,"activeRecallAnswers":72},[71],"What is another name for non-probability sampling?",[73],"Non-random sampling",{"id":75,"data":76,"type":24,"maxContentLevel":27,"version":37,"reviews":80},"4b509636-8bb4-4f5b-bfd6-3144828b0ea9",{"type":24,"title":77,"contentRole":37,"markdownContent":78,"audioMediaId":79},"Convenience sample ","So you’re feeling lazy. A convenience sample is a nonrandom sample that is mainly valuable because of how easy it is to take.\n\n![Graph](image://bc1ac58c-4c17-408e-96cf-34ec29ad4c8e \"Surveys are often a means of convenience sampling\")\n\nIf you’ve been in a mall, sometimes you might see people there taking surveys. By doing this, they are using convenience sampling. They set up a booth, or stand around waiting for passers-by, and they approach people at random to collect data and opinions.\n\nBut, just because they’re approaching people randomly, don’t confuse convenience sampling with random sampling.\n\nDue to the fact that not everybody has an equal probability of being in the mall on that exact day, not everybody has a chance of being selected. Some people were at home that day, right? 
That means they weren’t in the mall and couldn’t be part of the study.\n\nSo, when you’re sampling based on who is easiest to access, you are using convenience sampling.","37cac221-ba45-43e7-881d-eab067d0007b",[81],{"id":82,"data":83,"type":54,"version":24,"maxContentLevel":27},"ca6a58aa-f848-4b57-ac8b-f1f3edd7a02c",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":84,"multiChoiceCorrect":86,"multiChoiceIncorrect":88},[85],"What type of sampling is used when selecting participants based on who is easiest to access?",[87],"Convenience sampling",[89,90,91],"Random sampling","Stratified sampling","Cluster sampling",{"id":93,"data":94,"type":37,"version":24,"maxContentLevel":27,"pages":96},"578b9ee6-a0ad-470d-a1ea-12c234169a3a",{"type":37,"title":95},"Challenges in Non-Random Sampling",[97,111,125],{"id":98,"data":99,"type":24,"maxContentLevel":27,"version":24,"reviews":103},"36183c0a-f9de-4d4a-a44b-99fe9f036cb9",{"type":24,"title":100,"contentRole":37,"markdownContent":101,"audioMediaId":102},"Complications with convenience sampling ","\nThere are some complications with convenience sampling. Consider for example, if you set up a booth at the local swimming pool to gauge how many residents use the swimming facilities – what could go wrong? \n\nObviously, it’s great that you thought to find people who have experience with the city’s swimming facilities.\n\n ![Graph](image://4bcbb19d-f0bf-4870-b758-c012e62daead \"People at the swimming pool will probably have some bias about the swimming pool\")\n\nBut… doesn’t it all seem just a little bit too… convenient? \n\nConsider for example the fact that people who enjoy the facilities will logically keep going to the swimming pool. But, people who don’t enjoy the facilities are more likely to stop going. 
Therefore, you'll have some bias in your sample.\n\nConvenience sampling like this has low external validity, which is a fancy way of saying that you can’t infer a lot of things about the opinions of everyone in your town – which is your population of interest – if you only sampled people at the pool because your sample isn’t representative of the entire population.\n","68bbaf2d-179e-464d-a67c-5afb13e6d4cd",[104],{"id":105,"data":106,"type":54,"version":24,"maxContentLevel":27},"77e226ad-f610-46e9-afe9-a1a64eb6640b",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":107,"activeRecallAnswers":109},[108],"What is the term for the issue of a sample not being representative of the entire population?",[110],"Low external validity",{"id":112,"data":113,"type":24,"maxContentLevel":27,"version":24,"reviews":117},"d98c718a-c6db-4e9b-88eb-9ccdd57ec8df",{"type":24,"title":114,"contentRole":37,"markdownContent":115,"audioMediaId":116},"Voluntary response sampling ","\nVoluntary response sampling means constructing a sample based on who volunteers to be a part of your study. It is another form of non-random sampling. \n\nSometimes not everyone understands why voluntary response sampling is non-random. After all, you don’t choose the people intentionally, and they kind of just volunteer at random, right? Well, voluntary response sampling is not random because the characteristics and motivations that inspire people to volunteer for your sample introduce bias.\n\n\n ![Graph](image://3f26a92d-7b86-49c4-9afe-43da919a4cc2 \"People with very strong views on topics are more likely to come forward to share their views\")\n\nImagine if you asked people to volunteer their opinions on the old tree in the field being cut down to make way for new housing developments. Both environmental activists and passionate citizens would be very motivated to volunteer their opinions. Property developers too would perhaps participate. 
But other citizens who didn’t hold very strong opinions would be far less likely to volunteer.\n\n","c7444011-da04-4606-9c56-bc1ff6756264",[118],{"id":119,"data":120,"type":54,"version":24,"maxContentLevel":27},"b5bc6f46-4fa2-4611-b5f1-dc53c2f75ccb",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":121,"clozeWords":123},[122],"Voluntary response sampling is non-random because it introduces bias due to people's motivation to volunteer.",[124],"bias",{"id":126,"data":127,"type":24,"maxContentLevel":27,"version":24,"reviews":131},"325f045e-bf99-423e-8bda-113fd001e2e1",{"type":24,"title":128,"contentRole":37,"markdownContent":129,"audioMediaId":130},"Random sampling: Simple random sample ","\nIt’s simple, it’s random, and it’s a sample. It’s random sampling. What more could you want? \n\t\nBasically, in simple random sampling, every member in your population – be it everyone in your company, or all hermit crabs on the beach – has exactly the same chance of being selected for your sample. One of the ways you can do this practically is by assigning everyone a number and using a random number generator to choose your participants.\n\n ![Graph](image://412702a0-1690-418d-bbb4-fe2733e9a157 \"Hermit crabs\")\n\nAs an example, consider that there are 1000 people in your company, you assign everyone a number, and use a random number generator to generate 100 numbers – anyone with those numbers is now part of your sample.\n\nSimple random sampling is super fair, super random, and super simple. 
It ensures that everyone has exactly the same chance of being a part of your cool research study.\n\n","67fcbb6b-13bf-4b5a-a0ce-20a3ba84b3da",[132],{"id":133,"data":134,"type":54,"version":24,"maxContentLevel":27},"8dce5e4f-3005-4bd3-85ac-63c14b751052",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":135,"multiChoiceCorrect":137,"multiChoiceIncorrect":139},[136],"How can a researcher ensure that everyone in a population has an equal chance of being selected for a sample?",[138],"Simple random sampling",[90,140,91],"Systematic sampling",{"id":142,"data":143,"type":37,"version":24,"maxContentLevel":27,"pages":145},"4f0e577f-6643-423d-91a0-7c55b4d833b6",{"type":37,"title":144},"Random Sampling Techniques",[146,162,175,191],{"id":147,"data":148,"type":24,"maxContentLevel":27,"version":24,"reviews":152},"3f2e0d79-333b-49bd-bf4d-ff3aa9ca329b",{"type":24,"title":149,"contentRole":37,"markdownContent":150,"audioMediaId":151},"Random sampling: Systematic sampling ","\nYou might already be familiar with systematic sampling; in fact, we are often introduced to it early, in school. \n\nHow does systematic sampling work? Did you ever have to choose teams for school sports, and number everyone 1 or 2? Well that’s how systematic sampling works. As an example, you number every second person ‘2’ and then say ‘twos are part of this sample’.\n\n\n ![Graph](image://2f8aae73-dace-42dd-9da1-ded6a063e491 \"Number 2\")\n\nIt doesn’t have to be every second person either, it could be every 15th, anything you choose. 
\n\n","8efea2df-bfd9-4dd0-860f-6cb94d198a40",[153],{"id":154,"data":155,"type":54,"version":24,"maxContentLevel":27},"81caeb7f-d5c7-41a3-b704-05cda9d193a1",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":156,"binaryCorrect":158,"binaryIncorrect":160},[157],"How does systematic sampling work?",[159],"By numbering every second (or any chosen) person.",[161],"By randomly selecting people for the sample.",{"id":163,"data":164,"type":24,"maxContentLevel":27,"version":24,"reviews":168},"eb7d13b4-ca7a-4bec-ba6e-076afeaf18a5",{"type":24,"title":165,"contentRole":37,"markdownContent":166,"audioMediaId":167},"Problems with systematic sampling and patterns in your data","\nSystematic sampling is typically a cheaper method than simple random sampling, but you need to make sure there are no patterns lurking in your population or data. \n\n ![Graph](image://0583ca4f-24b2-4a81-a49e-bb6dea13c85b \"Patterns can be hidden within systematic sampling\")\n\nAs an example, imagine if you had two classes in school and you told everyone to find a partner with someone from the other class, and then you said everybody in class A is number 1, and everybody in class B is number 2. You would inadvertently bias your sample, because there is a pattern behind the 2's.\n\nIn this case you had a pattern in your data and nobody in class A would get to be part of your sample. That causes ‘sampling bias’ which means our results won’t generalize to the population. And when conducting research, we typically want our results to generalize. 
\n\n","b4d9f86f-d120-474d-b37d-c29b51575075",[169],{"id":170,"data":171,"type":54,"version":24,"maxContentLevel":27},"7ec038a9-8bb4-48f1-9252-ba1730501ee8",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":172,"activeRecallAnswers":174},[173],"What type of sampling is typically cheaper than simple random sampling?",[140],{"id":176,"data":177,"type":24,"maxContentLevel":27,"version":24,"reviews":181},"159aa9f4-533b-4327-bb3e-d021c88596f0",{"type":24,"title":178,"contentRole":37,"markdownContent":179,"audioMediaId":180},"Random sampling: Stratified sampling ","Stratified sampling involves dividing your population into groups called ‘strata’, and then sampling those strata using another random sampling method, like simple random sampling. It actually makes a lot of sense, even though it might seem a little more complicated than other sampling methods at first. \n\n ![Graph](image://9f6f70b5-6dd1-4919-97f2-d67fcf0b6bc8 \"Stratified sampling\")\n\nStratified sampling is used when you want to ensure that every group, or stratum, is properly represented in your sample – rather than it being completely random with something like simple random sampling.\n\n","c6bc79b3-0052-49f3-b46a-947eac98cac6",[182],{"id":183,"data":184,"type":54,"version":24,"maxContentLevel":27},"fc030c8c-2eb1-4fdb-835a-774d645917ff",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":185,"binaryCorrect":187,"binaryIncorrect":189},[186],"What is the purpose of stratified sampling?",[188],"To ensure that every group is properly represented in the sample",[190],"To ensure that the sample is completely random",{"id":192,"data":193,"type":24,"maxContentLevel":27,"version":24,"reviews":197},"069e4f83-f72f-4d13-9bda-f9d60647cbb9",{"type":24,"title":194,"contentRole":37,"markdownContent":195,"audioMediaId":196},"An example of stratified sampling","\nStratified sampling involves first dividing your population into strata, and then using random sampling within each stratum. 
\n\n ![Graph](image://7a56e8ff-8f1b-4fb3-bf48-024acfae62f3 \"Engineers discuss plans\")\n\nAs an example, let’s say that there are 8000 engineers in a company and 2000 office staff. You want to select a sample of 1000 people to find out what the company as a whole is like. To make sure your sample reflects the company perfectly, you separate your company into two groups, engineers and office staff. \n\nThen within each group, you use a random sampling method such as simple random sampling to select exactly 200 office staff, and 800 engineers. That way you get a sample that is proportionate to your broader population. \n\nThere could be two groups, otherwise called strata, like in the engineers and office workers example, or there could be 20 groups. Nobody is limiting you! \n\n","c116f976-c590-4439-b353-eaf8cd4a9980",[198],{"id":199,"data":200,"type":54,"version":24,"maxContentLevel":27},"52459add-6d96-454b-967a-0bd283c34eae",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":201,"multiChoiceCorrect":203,"multiChoiceIncorrect":205},[202],"How many groups can be used when employing stratified sampling?",[204],"Any number",[206,207,208],"Two","Five","Ten",{"id":210,"data":211,"type":37,"version":24,"maxContentLevel":27,"pages":213},"33156ecb-cd31-4923-ab02-03baa478b008",{"type":37,"title":212},"Cluster and Stratified Sampling",[214,228,242,258],{"id":215,"data":216,"type":24,"maxContentLevel":27,"version":24,"reviews":220},"7c10432f-525b-445a-95be-ded0e0791ad8",{"type":24,"title":217,"contentRole":37,"markdownContent":218,"audioMediaId":219},"Random sampling: Cluster sampling ","Cluster sampling is when you divide your population into clusters, and then select only some of those clusters at random. It is a probability sampling method, otherwise known as a random sampling method. 
\n\n ![Graph](image://13f02f9f-edb5-44cc-8284-34782dcb8ecd \"Cluster sampling\")\n\nIn cluster sampling, if a group is selected, then all of the members of that group will be included in the study. Members of the groups not selected at random will not be included in the study. \n","c56e1d4a-309e-4557-b496-87a5557e22da",[221],{"id":222,"data":223,"type":54,"version":24,"maxContentLevel":27},"6e51ff52-b0cd-4b16-8d88-85597451e705",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":224,"activeRecallAnswers":226},[225],"What type of sampling method does cluster sampling belong to?",[227],"Probability sampling",{"id":229,"data":230,"type":24,"maxContentLevel":27,"version":24,"reviews":234},"4250f335-122e-44a0-a843-8d9dfb32a0a8",{"type":24,"title":231,"contentRole":37,"markdownContent":232,"audioMediaId":233},"An example of cluster sampling ","\nAs an example of cluster sampling, imagine you work for a huge company and you’re the big boss with the corner office. One day, you’ve decided that you want to survey all of your company's offices, of which there are 100 all across the country. Importantly, all offices are pretty similar. They have approximately the same amount of people, within the same number of roles. \n\n![Graph](image://658ebe23-02b2-4b06-9540-5d5799f5e943 \"An office building\")\n\nYou couldn’t possibly travel to every office to collect all the data you need. So you task your statisticians with coming up with a solution. They come back to you and say ‘we can use random sampling to select 30 offices, which we would label as 30 clusters, instead of sampling every single office, it’s much less work’. It’s important to note that in cluster sampling all your clusters should be similar – so if each office was for different departments, like engineering, sales, etc. then it wouldn’t work. \n\nFrom here you could include everyone from the 30 offices in your cluster. 
Or, you could then cluster people again based on another characteristic – that is called multistage sampling because you sample in multiple stages. \n\n","f5a9966d-4cd6-4fb7-9ebf-e9d87c15de94",[235],{"id":236,"data":237,"type":54,"version":24,"maxContentLevel":27},"a27b220b-2468-414f-ba8f-f9fe2dd83f0e",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":238,"multiChoiceCorrect":240,"multiChoiceIncorrect":241},[239],"What is the name of the sampling method used to select 30 offices instead of sampling every single office?",[91],[90,140,87],{"id":243,"data":244,"type":24,"maxContentLevel":27,"version":24,"reviews":248},"11913e66-d50f-4cb9-bdd1-068052ea7da6",{"type":24,"title":245,"contentRole":37,"markdownContent":246,"audioMediaId":247},"How do stratified sampling and cluster sampling differ? ","Cluster sampling first divides the population into groups, then randomly selects a number of groups, and then includes all the members of those randomly chosen groups in the study. So in cluster sampling, your group is not guaranteed to be part of the study, but if your group is randomly selected then you will definitely be a part of the study. \n\n![Graph](image://88402c3b-1e46-44c8-aca2-b5dc56870c01 \"Stratified random sampling vs. cluster sampling\")\n\nStratified sampling on the other hand first divides a population into groups, and then randomly selects some members from all of the created groups. 
In stratified sampling, your group is guaranteed to be part of the study, but not all members in your group will be.\n\n","dd15161b-1c9a-41e3-ae5c-c7b6135040b3",[249],{"id":250,"data":251,"type":54,"version":24,"maxContentLevel":27},"dd2ab9f4-2737-476f-8505-5a039597cbf8",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":252,"binaryCorrect":254,"binaryIncorrect":256},[253],"In cluster sampling, what is the likelihood of an individual being part of the study?",[255],"Likely if the group is randomly selected",[257],"Guaranteed to be part of the study",{"id":259,"data":260,"type":24,"maxContentLevel":27,"version":24,"reviews":264},"40defa3e-0153-48ef-b01c-addcc9799d9a",{"type":24,"title":261,"contentRole":37,"markdownContent":262,"audioMediaId":263},"When to use stratified sampling and cluster sampling ","\nCluster sampling first divides the population into groups, then randomly selects a number of groups, and then includes all the members of those randomly chosen groups in the study. You should use it if you expect that all your clusters are homogenous, which means they are the same. Like different office locations of a data science company. \n\n ![Graph](image://dd140e65-f472-463f-98c9-ed308c7bb49d \"A bar chart showing stratified sampling\")\n\nStratified sampling on the other hand first divides a population into groups, and then randomly selects some members from all of the created groups. You should use stratified sampling when you expect that your groups are heterogeneous, which means they are different. For example, separating the maths majors from the humanities majors at university. 
\n\nYou can remember the difference between cluster and stratified sampling through the phrase \"All from some, and some from all\".","3fca04db-d14f-4e27-a9b4-43e7b8780b9a",[265],{"id":266,"data":267,"type":54,"version":24,"maxContentLevel":27},"27eaaa73-b9df-45b2-af06-fc1f7e2c840b",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":268,"activeRecallAnswers":270},[269],"How is cluster sampling different from stratified sampling?",[271],"Cluster sampling randomly selects a number of groups, while stratified sampling randomly selects some members from all groups",{"id":273,"data":274,"type":29,"maxContentLevel":27,"version":24,"orbs":277},"dfad07eb-f207-4410-b5c9-ee1cfa4fad28",{"type":29,"title":275,"tagline":276},"Correlations Between Variables","The different types of correlation, and how to identify them.",[278,332,365],{"id":279,"data":280,"type":37,"version":24,"maxContentLevel":27,"pages":282},"b19aafc0-3995-44a1-8b90-21476cf27d4f",{"type":37,"title":281},"Understanding Correlation",[283,296,314],{"id":284,"data":285,"type":24,"maxContentLevel":27,"version":24,"reviews":289},"3a9af20c-e2e8-4853-ad7a-382b5f8f1344",{"type":24,"title":286,"contentRole":37,"markdownContent":287,"audioMediaId":288},"Correlation","\nCorrelation measures the relationship between two variables. More precisely, it calculates the level of change that you can expect to see in one variable relative to a change in another variable. \n\nImagine a scatter graph where your independent variable is “time spent studying” – your dependent variable is “number of questions answered correctly”.\n\nDo you think that the amount of time you spend studying is related to the number of questions you answer correctly in Kinnu? It probably is! \n\nThat means that ‘the amount of time spent studying is correlated with the number of questions answered correctly’. The more you study, the more questions you answer correctly. \n\nThe more X you have, the more Y you also have. 
This is a correlation.\n","aa4ee9d5-2803-479d-b25e-15fb6091fc56",[290],{"id":291,"data":292,"type":54,"version":24,"maxContentLevel":27},"41d9d728-855e-4c38-8909-bad53dc4fb12",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":293,"activeRecallAnswers":295},[294],"What term describes the relationship between two variables?",[286],{"id":297,"data":298,"type":24,"maxContentLevel":27,"version":24,"reviews":302},"2b79a4df-7b8f-45aa-ae38-65f978638c2d",{"type":24,"title":299,"contentRole":37,"markdownContent":300,"audioMediaId":301},"Positive correlation"," ![Graph](image://830b2f43-1e59-4bde-8e89-e3f347a39dbe \"Positive correlation (left)\")\n\n\nIf as one value gets higher, the other one does too, then you have a positive correlation. An example of a positive correlation is height and weight. As people get taller, they also tend to weigh more. \n\nBut it works both ways, because as one value gets lower, and the other one does too, that is also a positive correlation. So, a positive correlation is when both variables move in the same direction. \n\nOn a scatter plot, a positive correlation slopes up and to the right. 
\n\n","ff79f64b-b46a-4548-a4de-0e538a521c2f",[303],{"id":304,"data":305,"type":54,"version":24,"maxContentLevel":27},"684f09be-41aa-4d80-8bef-2ba5c48a6db8",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":306,"multiChoiceCorrect":308,"multiChoiceIncorrect":310},[307],"On a scatter plot, what does a positive correlation look like?",[309],"Slopes up and to the right",[311,312,313],"Slopes up and to the left","Slopes down and to the right","A bell curve",{"id":315,"data":316,"type":24,"maxContentLevel":27,"version":24,"reviews":320},"61d10868-f648-4f66-b10f-8bddfc4aeb91",{"type":24,"title":317,"contentRole":37,"markdownContent":318,"audioMediaId":319},"Negative and no correlation"," ![Graph](image://830b2f43-1e59-4bde-8e89-e3f347a39dbe \"Negative correlation (right)\")\n\n\nA negative correlation is when as one value moves in one direction, the other moves in the opposite direction. As an example, if one gets higher, the other gets lower. Like when you climb up a mountain and get higher above sea level, the temperature gets lower. \n\nOn a scatter plot, a negative correlation slopes down and to the right. \n\n\n\n ![Graph](image://f9cf9fbf-612d-4bb5-8c23-6250f799b989 \"No correlation\")\n\nWhat does no correlation look like? When there is no correlation between variables, a scatter plot looks like somebody has just randomly thrown darts at it. There is no real pattern to be seen in the data. This shows that your data is not correlated. 
For example, there is no correlation between the amount of tea you drink and how long my commute is.\n\n","e581217b-736e-4641-8386-b0c9241ef3b8",[321],{"id":322,"data":323,"type":54,"version":24,"maxContentLevel":27},"d44acadd-5377-4444-9141-113ef8e3aa00",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":324,"multiChoiceCorrect":326,"multiChoiceIncorrect":328},[325],"What does a scatter plot look like when there is no correlation between variables?",[327],"Randomly thrown darts",[329,330,331],"Sloping down and to the right","Sloping up and to the right","Sloping up and to the left",{"id":333,"data":334,"type":37,"version":24,"maxContentLevel":27,"pages":336},"e6b4d7ac-2439-4dc0-888b-63090d7f9980",{"type":37,"title":335},"Visualizing Correlation",[337,351],{"id":338,"data":339,"type":24,"maxContentLevel":27,"version":24,"reviews":343},"46af8db0-ad17-4a42-bc09-6f43d0363d35",{"type":24,"title":340,"contentRole":37,"markdownContent":341,"audioMediaId":342},"The line of best fit in scatterplots","Often when viewing scatterplots, you will see a line that passes roughly through the middle of the mass of data points. This is called the line of best fit, and represents a linear estimate of the dependent variable based on the value of the independent variable. It enables a visualization of the general trend in your data. \n\n ![Graph](image://d0d8b3bb-9128-47a6-94ff-fbe598501a84 \"Lines of best fit on the left and right\")\n\nWhen data is so tightly clustered together as it is in the image above, it’s relatively easy to visualize the general trend in your data without the line of best fit. However, in cases where your data is messier, it serves as a useful visualization tool. It is also used in predictive statistical models to mathematically – specifically algebraically – represent the relationships between variables. 
\n\n\n","bfa91564-d16a-4b89-bcb4-97b55cca1aaf",[344],{"id":345,"data":346,"type":54,"version":24,"maxContentLevel":27},"caf1227b-a3ae-40a6-83ad-751c5d723753",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":347,"clozeWords":349},[348],"The line of best fit is used to visualize the general trend in data and to mathematically represent the correlations between variables.",[350],"line of best fit",{"id":352,"data":353,"type":24,"maxContentLevel":27,"version":24,"reviews":357},"5bfd5abe-930a-4ac5-a111-9bdb1a4c769a",{"type":24,"title":354,"contentRole":37,"markdownContent":355,"audioMediaId":356},"Does the slope matter when visualizing correlation?","When you look at a scatter plot, you will be able to visualize the strength of a correlation. Often on a scatter plot, you will also see a line of best fit, that’s the line that runs through the middle of the data points. \n\n\n ![Graph](image://938408db-08fe-41a0-a441-5f615a8b2ba8 \"Two graphs with the same strength of correlation\")\n\n\nWhen visualising a correlation, the steepness of this line does not affect the strength of the correlation. What affects the strength of a correlation is how closely related the data points are to one another, which represents how reliably a certain change in one variable predicts a change in the other.  
\n","6f84d0da-ac0f-42a7-91ef-967760a26b64",[358],{"id":359,"data":360,"type":54,"version":24,"maxContentLevel":27},"eea7d088-6373-47bb-96ff-af854be5a602",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":361,"activeRecallAnswers":363},[362],"How does the steepness of a line of best fit on a scatter plot affect the strength of a correlation?",[364],"It does not affect the strength of the correlation",{"id":366,"data":367,"type":37,"version":24,"maxContentLevel":27,"pages":369},"ea2ee01e-c0e6-43d0-9aa4-2c158a470b01",{"type":37,"title":368},"Quantifying Correlation",[370,388,406],{"id":371,"data":372,"type":24,"maxContentLevel":27,"version":24,"reviews":376},"31ce6c49-e049-463c-9150-cb5f3bed553a",{"type":24,"title":373,"contentRole":37,"markdownContent":374,"audioMediaId":375},"Correlation coefficient ","\nPearson’s Correlation Coefficient, otherwise known as Pearson’s r, is a common way to calculate the correlation between two quantitative variables. It was developed by Karl Pearson in the 1890s. \n\nThis coefficient uses a formula to calculate the relationship between variables. The resulting value will range between -1 and 1. A positive value indicates a positive relationship, a negative value indicates a negative relationship, and a value of 0 suggests no linear relationship. The closer the absolute value of r is to 1, the stronger the linear relationship.\n\n\n ![Graph](image://95703044-5a63-47ec-ad30-1a46ffbb7c1c \"Pearson's Correlation Coefficient can be used to calculate r in these graphs\")\n\n\nThe Pearson Correlation Coefficient tells you in which direction two variables are correlated, as well as the strength of that correlation. This makes it a key tool for data scientists to quantify the strength of a correlation, instead of guessing based on visual representations. 
\n\n","267f1cb0-a13d-41cd-a541-07912d51924c",[377],{"id":378,"data":379,"type":54,"version":24,"maxContentLevel":27},"ba3fbf30-3be5-4bcf-bc82-e31177ca4547",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":380,"multiChoiceCorrect":382,"multiChoiceIncorrect":384},[381],"What is the name of the coefficient for quantifying the strength of a correlation between two variables?",[383],"Pearson’s Correlation Coefficient",[385,386,387],"Spackman’s Correlation Coefficient","Kendall’s Correlation Coefficient","Chi-Square Correlation Coefficient",{"id":389,"data":390,"type":24,"maxContentLevel":27,"version":24,"reviews":394},"c16a9752-5900-4cf3-829b-9a56b937bf72",{"type":24,"title":391,"contentRole":37,"markdownContent":392,"audioMediaId":393},"Correlation coefficient interpretation","\n ![Graph](image://a75ab998-c475-4af2-8fd7-47a6bccdd97e \"p = Pearson's Coefficient\")\n\nPearson’s correlation coefficient ranges from -1 to +1. \n\nA correlation coefficient of less than 0 signifies a negative correlation, while greater than 0 signifies a positive correlation. \n\nBut, the strength of a correlation is also important. The table below shows you how to define the strength of your correlation. 
\n\n\n ![Graph](image://da60a3de-5249-4c7d-bb6a-d81324c8740e \"The different strengths of Pearson's coefficient\")\n\n","fc39a224-1336-4701-b254-e648c482c501",[395],{"id":396,"data":397,"type":54,"version":24,"maxContentLevel":27},"a18b8dd0-a059-41b7-84f3-1251d2692c57",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":398,"multiChoiceCorrect":400,"multiChoiceIncorrect":402},[399],"What is the range of Pearson's correlation coefficient?",[401],"-1 to +1",[403,404,405],"-2 to +2","0 to +2","-1 to +2",{"id":407,"data":408,"type":24,"maxContentLevel":27,"version":24,"reviews":412},"c8fc5438-bc48-4bbe-8bc5-27c6f202859d",{"type":24,"title":409,"contentRole":37,"markdownContent":410,"audioMediaId":411},"Correlation is not causation ","\nIf you calculated the correlation coefficient – the strength of relationship – for your two continuous variables and saw that the more people studied, the better test scores they got, you could say that there was a correlation between time studying and test scores. \n\nHowever, you can’t ever say that one caused the other from the correlation coefficient alone. This is true no matter how intuitive or obvious it might seem. \n\n‘But of course studying causes better test scores’ you say. \n\nWhat if I told you that per capita cheese consumption was correlated with the number of people who died by getting tangled in their bedsheets? Would you be so sure that cheese causes this? \n\n ![Graph](image://c64c51a8-38ed-46de-93bc-3c7ca470a759 \"In this instance, it's unlikely that correlation means causation\")\n\nWhat about the fact that the number of films Nicolas Cage appears in is correlated with the number of people who drown in a pool? Would you tell us that Nicolas Cage films cause drownings? 
\n\nCorrelation is not causation.\n\n","df0d74f4-756b-49a3-80fa-19fd4cada7af",[413],{"id":414,"data":415,"type":54,"version":24,"maxContentLevel":27},"ed31ca59-c4c9-4844-ae73-94a3459fbe79",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":416,"clozeWords":418},[417],"Causation can never be inferred from correlation alone.",[419],"correlation",{"id":421,"data":422,"type":29,"maxContentLevel":27,"version":24,"orbs":425},"a63458b1-5735-4fce-92b3-56c6c3091ac2",{"type":29,"title":423,"tagline":424},"Problems with Data ","Working with data can be difficult – avoid common data traps in your analysis.",[426,495,580],{"id":427,"data":428,"type":37,"version":24,"maxContentLevel":27,"pages":430},"317e444f-b3f7-4b48-86f3-86eac39d43ee",{"type":37,"title":429},"Understanding Missing Data",[431,445,461,477],{"id":432,"data":433,"type":24,"maxContentLevel":27,"version":24,"reviews":437},"6d380f2b-482f-4ad0-a65e-83912f8e9dd4",{"type":24,"title":434,"contentRole":37,"markdownContent":435,"audioMediaId":436},"What is missing data? ","\nMissing data is, well, exactly as it sounds. It’s data that should be there, but isn’t! In data science, for one observation, you might have many many different variables, be they continuous, count, nominal, or ordinal.\n\n\n ![Graph](image://eda810f6-660a-400c-bd05-d1510bd284cc \"Curves showing sample results with different amounts of missing data\")\n\nTake any dataset, like a website traffic report for example, and you will find it full of data of various types. As an example, think about ‘device type’ which is a nominal variable, or ‘pages viewed’ which is count data, or time on a page which is a continuous measurement.\n\nIt’s all well and good if your dataset has all the data points you need, but what do you do when it is missing? It’s an important question because often for an observation you will have data for one variable, but not another. Maybe you have somebody's height but not their weight. 
Or perhaps one of the people you sampled refused to answer your survey questions.\n\nIn general, you either remove missing data or you impute it, which means you replace it with a value like the mean, median, or mode for that variable. \n\nIn practice, it actually gets very nuanced and complicated. You wouldn’t believe how much trouble pesky missing data can cause. \n\nFor now, just remember, that the two most common ways of dealing with missing data are removal and imputation. \n\n","af48b9cc-5e3e-4738-a0a3-1bb73b070542",[438],{"id":439,"data":440,"type":54,"version":24,"maxContentLevel":27},"dca16d64-80ed-44b0-bff8-bc55466d272b",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":441,"clozeWords":443},[442],"Missing data can be dealt with by removing or imputing it.",[444],"imputing",{"id":446,"data":447,"type":24,"maxContentLevel":27,"version":24,"reviews":451},"2235ff4d-a87e-4a02-a78a-3eb390860524",{"type":24,"title":448,"contentRole":37,"markdownContent":449,"audioMediaId":450},"Missing Completely at Random Data ","\nMissing Completely at Random (MCAR) data is when there is no pattern in your missing data. MCAR data seems unrelated to any observed or unobserved factor. \n\nMissing Completely at Random data is pretty rare in reality, but an example might be if data just somehow got lost and couldn’t make it to your final dataset, then that would be a case of MCAR data. Perhaps your colleague accidentally lost the only USB with the file at the restaurant after work. \n\nGenerally, if your data is MCAR then you don’t need to use methods like removal or deletion of data, or imputation of values to clean up your dataset. You can proceed with your analysis as you please. 
\n","13a098e0-b02f-43a2-b1b3-31af85c137fb",[452],{"id":453,"data":454,"type":54,"version":24,"maxContentLevel":27},"f22fce8e-074b-4564-930c-0de8426b9dbe",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":455,"binaryCorrect":457,"binaryIncorrect":459},[456],"What is an example of Missing Completely at Random data?",[458],"Data that got lost and couldn't make it to the final dataset",[460],"Data that is related to an observed or unobserved factor",{"id":462,"data":463,"type":24,"maxContentLevel":27,"version":24,"reviews":467},"1e2d6b4d-f00f-42e9-be15-81c4284683c7",{"type":24,"title":464,"contentRole":37,"markdownContent":465,"audioMediaId":466},"Missing Not at Random Data","\nMissing Not at Random ‘MNAR’ occurs when there is a pattern behind the data you are missing and it is related to the very data that you are missing. Consider, for example, if you were doing a housing survey and low-income households were less likely to report their income. Your survey would be biased.\n\nThis is no small problem. In fact, some estimates say around 10-15% of income data is missing from surveys because people don’t answer it. \n\nMissing Not at Random data can present problems because your sample may not be representative of your population. 
If you have Missing Not at Random data in your dataset, then methods like removing data or imputing values might need to be used.\n","6c95abab-1fc7-4b6e-a130-aa55d2b4267e",[468],{"id":469,"data":470,"type":54,"version":24,"maxContentLevel":27},"8c12b28c-4286-4e02-8e22-ab3fccc6ea0c",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":471,"binaryCorrect":473,"binaryIncorrect":475},[472],"In income surveys, what percentage of data is estimated to be missing because people deliberately don't answer?",[474],"10-15%",[476],"5-10%",{"id":478,"data":479,"type":24,"maxContentLevel":27,"version":24,"reviews":483},"5bae932c-6b85-4a24-b66f-4e230fb7c8aa",{"type":24,"title":480,"contentRole":37,"markdownContent":481,"audioMediaId":482},"Missing at Random Data ","\nMissing at Random (MAR) data is when there is a pattern or a cause behind your missing data, but it’s not directly related to the variable you’re missing data for. \n\nAs an example, young people might fail to answer questions related to their income on surveys. But this is not necessarily because they're ‘low income’, but because, as a cohort, they value their privacy and are less likely to discuss these things. \n\nAlternatively, men might not answer income data because they simply don’t like to. So your data is reliably missing based on another value in your dataset, like age or gender. 
\n","76af1504-d531-4252-abaa-df17392b2078",[484],{"id":485,"data":486,"type":54,"version":24,"maxContentLevel":27},"cfb82a91-b111-42d1-8a69-8b71901e5b40",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":487,"multiChoiceCorrect":489,"multiChoiceIncorrect":491},[488],"What type of missing data is it when there is a pattern or cause behind the missing data, but it is not directly related to the variable?",[490],"Missing at Random (MAR)",[492,493,494],"Missing Completely at Random (MCAR)","Missing Not at Random (MNAR)","Missing Systematically at Random (MSAR)",{"id":496,"data":497,"type":37,"version":24,"maxContentLevel":27,"pages":499},"ea926309-5f75-49b5-a524-3c723bd978fa",{"type":37,"title":498},"Handling Missing Data",[500,514,530,544,562],{"id":501,"data":502,"type":24,"maxContentLevel":27,"version":24,"reviews":506},"70f3805f-a464-45b5-a822-5db8c79dfffe",{"type":24,"title":503,"contentRole":37,"markdownContent":504,"audioMediaId":505},"What should be done with Missing At Random data?","\nMissing at random data can generally be left as it is because the pattern behind the missing data doesn’t have anything to do with the variable of interest itself, for example income. Instead the pattern is based on an unrelated variable, like age or gender. \n\n ![Graph](image://c894e7ea-898c-4967-862d-e27d06b02f0a \"Most MAR data can be treated as normal, unless there is a correlation with the variable of interest\")\n\nHowever, even if that variable is theoretically unrelated, it might still be correlated to the variable of interest in some way. As an example, older men might be more likely to earn higher salaries. 
As a result, we need to be careful handling MAR data.\n\n","255e5203-ef57-4dca-a1a7-a76272763948",[507],{"id":508,"data":509,"type":54,"version":24,"maxContentLevel":27},"b507893f-44ae-4625-af65-81fa5f411d1e",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":510,"clozeWords":512},[511],"When dealing with Missing At Random data, we need to be careful to consider if the missing data is correlated to the variable of interest.",[513],"correlated",{"id":515,"data":516,"type":24,"maxContentLevel":27,"version":24,"reviews":520},"aa632633-3411-45c3-ac5c-cd162dfdfa51",{"type":24,"title":517,"contentRole":37,"markdownContent":518,"audioMediaId":519},"Imputing and Removing Missing Data","\nIf your data is Missing Completely at Random (MCAR) or Missing at Random (MAR) then you generally don’t need to remove or impute, and can proceed with your analysis. \n\nHowever, if you have Missing Not at Random (MNAR) data, then it’s possible your sample isn’t representative of the population, and you need to either remove data via listwise or pairwise deletion, or impute data by calculating the mean or median and using that as the value for any missing values. \n\nImputing just means that you fill the empty space for that variable with a value like the mean or median. \n\n ![Graph](image://f31f9654-ee08-443b-97d0-0423d92bd8ea \"Missing data indicated by NaN\")\n\nIn the example below, we take the average value of each column and add it to the missing data cells, which are indicated by the ‘NaN’ value. \n\n ![Graph](image://58babd2f-af82-484f-8b89-2cce809bf881 \"Imputing missing data with mean values\")\n\n\nConsider, for example, column three (‘col3’): the mean of 3 and 9 is 6. Therefore, the number 6 gets imputed into the cell with the missing value in the bottom row of col3. 
","306d09b7-1fca-4173-b09e-e3e7d43fbbd1",[521],{"id":522,"data":523,"type":54,"version":24,"maxContentLevel":27},"27763c64-ce78-4f0c-b339-8d9f5f41ba90",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":524,"binaryCorrect":526,"binaryIncorrect":528},[525],"What does it mean to impute data?",[527],"To fill the empty space for that variable with a value like the mean or median",[529],"To remove data using listwise or pairwise deletion",{"id":531,"data":532,"type":24,"maxContentLevel":27,"version":24,"reviews":536},"3e066b4e-c3de-4597-9af7-86ccaef65bdd",{"type":24,"title":533,"contentRole":37,"markdownContent":534,"audioMediaId":535},"Listwise Deletion ","\nListwise deletion is the most common method to use when removing data. In listwise deletion, you delete every single observation that has missing data. \n\nImagine your data is laid out in a table, with the rows being the people you’ve surveyed and the columns being the data categories you want to collect. \n\nListwise deletion would delete the entirety of any row that has an empty box. This means anybody who you surveyed who didn’t complete all of the questions would be removed from the sample. \n","ec24a7db-1b1b-46f8-b8ae-927f0d9fbb81",[537],{"id":538,"data":539,"type":54,"version":24,"maxContentLevel":27},"58548099-94fc-4aba-9db3-2c00d0f03772",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":540,"activeRecallAnswers":542},[541],"What is the most common method for removing data?",[543],"Listwise deletion",{"id":545,"data":546,"type":24,"maxContentLevel":27,"version":24,"reviews":550},"973f2856-9e61-4ff1-bac3-fcc08267bf9c",{"type":24,"title":547,"contentRole":37,"markdownContent":548,"audioMediaId":549},"Listwise deletion – bias and power","\nThe problem with listwise deletion is that it can create bias in our results. This is because the individuals or observations that are missing data may be different in some way from those that are not missing data. 
\n\n\n\nFor example, imagine we are studying the relationship between a person's height and their income. If taller people are more likely to leave their income blank on a survey, then using listwise deletion would make our sample of people with income data shorter on average, leading to bias in our results.\n\nPower refers to the ability of a statistical analysis to detect a real effect if one exists. When we use listwise deletion, we are throwing away a lot of data, which can decrease the power of our analysis. This means that even if there is a real relationship between height and income, our analysis may not be able to detect it because we have less data to work with.\n\nThis kind of bias is only created when there is a pattern behind the missing data. If the data is Missing Completely At Random (known as MCAR), then listwise deletion won’t cause any biases.\n","fc09d477-4f8e-4654-8ded-94ee05ff6061",[551],{"id":552,"data":553,"type":54,"version":24,"maxContentLevel":27},"cb04357b-122d-4d49-9d9c-c38546054928",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":554,"multiChoiceCorrect":556,"multiChoiceIncorrect":558},[555],"What is the consequence of missing data when using listwise deletion?",[557],"Bias in results",[559,560,561],"Increased power","MCAR data","No consequence",{"id":563,"data":564,"type":24,"maxContentLevel":27,"version":24,"reviews":568},"af79d952-5d94-4598-a1e8-b87be1a10662",{"type":24,"title":565,"contentRole":37,"markdownContent":566,"audioMediaId":567},"Pairwise Deletion ","\nIt might be helpful to think of pairwise deletion as ‘Available Case Analysis’, because when you analyse a relationship between variables, you take every observation that has available data for all of your variables of interest, and leave the rest. 
\n\n ![Graph](image://fef2bbe5-dc14-4892-b84f-befd5fd0edf4 \"A table ready for pairwise deletion\")\n\nUsing the example above, to analyse the relationship between Weight and Lung Capacity, you can use observations 1 and 2 because only these observations have data for both those variables. \n\nBut to analyse Height and Lung Capacity you can use only observations 1 and 3. And for analysis of Height and Weight, only observation 1 has data for both those variables. \n\nWhen you have datasets with missing observations scattered across different columns, the number of observations in each analysis can vary greatly. Moreover, every sample used is different, as in the example above. \n\nFun fact: observation 1 is based on British Olympic Rower, Peter Reed OBE, said to have the largest recorded lung capacity (at least as of 2022) at 11.86 liters. Given that the average lung capacity is six liters, that’s quite impressive. But, you can see how having a superhuman like Peter in one analysis but not in another could affect your data, and add bias to your results. 
\n\n","85442f39-5fd8-473a-bd6e-efb61910cf45",[569],{"id":570,"data":571,"type":54,"version":24,"maxContentLevel":27},"220bf02d-a995-4ab2-b944-7e0229a374fb",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":572,"multiChoiceCorrect":574,"multiChoiceIncorrect":576},[573],"What is the name of the analysis method that uses only observations with available data for all of the variables of interest?",[575],"Available Case Analysis",[577,578,579],"Complete Case Analysis","Partial Case Analysis","Missing Case Analysis",{"id":581,"data":582,"type":37,"version":24,"maxContentLevel":27,"pages":584},"cda52764-995c-43aa-bf3d-90d951c6d8c9",{"type":37,"title":583},"Dealing with Data Issues",[585,603],{"id":586,"data":587,"type":24,"maxContentLevel":27,"version":24,"reviews":591},"99345bea-6b0a-414e-88e2-ca7dde3a79c3",{"type":24,"title":588,"contentRole":37,"markdownContent":589,"audioMediaId":590},"Truncated Data","\nWhen it comes to your data, you can never be too cautious, because inaccurate data can pop up in the most unexpected ways. One such example is ‘truncated data’, which is data that has been cut off from your dataset. It’s hard to see, because, well, it’s not there! \n\nTruncating data means that values above or below a cutoff have been excluded. 
For example, if you are collecting data on salary ranges within a company but only record people earning above $30,000, your data would be truncated at $30,000.","b35b43ec-4ecf-46e4-81a1-c67a66d0ff1b",[592],{"id":593,"data":594,"type":54,"version":24,"maxContentLevel":27},"d615f924-707a-4060-900e-1b2f56bca185",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":595,"multiChoiceCorrect":597,"multiChoiceIncorrect":599},[596],"What is the term used to describe data that has been cut off from a dataset?",[598],"Truncated data",[600,601,602],"Censored data","Omitted data","Abbreviated data",{"id":604,"data":605,"type":24,"maxContentLevel":27,"version":24,"reviews":609},"b99a7f7b-dff3-4141-9e54-548212dd1bbb",{"type":24,"title":606,"contentRole":37,"markdownContent":607,"audioMediaId":608},"Inaccurate and Censored Data"," \nWhat exactly is ‘Inaccurate data’? Imagine that you’re gathering information on car owners and their pets. Your hypothesis is that people who drive Teslas are more likely to own Labradors. \n\nBut, inadvertently, in your ‘dog breed’ column, you seem to have a lot of some weird new dog breed called ‘Model 3’, which is a model of Tesla. That’s inaccurate data. Inaccurate data can be caused by poor data entry, poor data measurement or due to unconscious biases of the person collecting the data.\n\n\n ![Graph](image://27e010a6-0de8-402d-9529-e63861995436 \"A Tesla Model 3 (not a dog)\")\n\nCensored data is a form of inaccurate data. It will show in your dataset as a range. For example, you could have a recorded height in centimetres listed as ‘>200’. This can happen because your measurement instrument might not actually measure higher than that. 
Alternatively, maybe your measuring tape ran out.\n\n","9ed7b0ee-02db-4548-8673-8d747c1a5074",[610],{"id":611,"data":612,"type":54,"version":24,"maxContentLevel":27},"0c2c2ae1-ac41-4934-9309-47adfe904308",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":613,"activeRecallAnswers":615},[614],"What is meant by 'inaccurate data'?",[616],"Data that is incorrect, incomplete, or distorted",{"id":618,"data":619,"type":29,"maxContentLevel":27,"version":37,"orbs":622},"c2b3ccfe-3844-4714-8d95-d60337cd45ba",{"type":29,"title":620,"tagline":621},"Properties of Your Data","Learn the fundamental metrics required to interpret your data like a pro.",[623,701,762],{"id":624,"data":625,"type":37,"version":37,"maxContentLevel":27,"pages":627},"2b0370cf-9a75-4253-901b-17c96c20b174",{"type":37,"title":626},"Measures of Central Tendency",[628,644,660,666,683],{"id":629,"data":630,"type":24,"maxContentLevel":27,"version":37,"reviews":634},"29c1b45f-7108-47d6-abd7-c4242f0984b8",{"type":24,"title":631,"contentRole":37,"markdownContent":632,"audioMediaId":633},"Measures of central tendency","The three main measures of central tendency – meaning the methods for identifying the typical, central value of your data – are the mean, the median, and the mode. Now, while they all tell you about the central point of your dataset, they’re also all very different. 
So, how can your data have three different centers?\n\n![Graph](image://46db1880-02c0-440b-897a-a6aa03b63be5 \"The three measures of central tendency\")\n\nIn short, the mean is what most people call the average, the median is the middle value, and the mode is the most commonly or frequently observed value.","076a616d-30f4-441a-9735-c708dce6f46c",[635],{"id":636,"data":637,"type":54,"version":24,"maxContentLevel":27},"6b39c1b5-21ae-4208-9151-f32b17de4ccc",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":638,"binaryCorrect":640,"binaryIncorrect":642},[639],"What is the most commonly or frequently observed value in a dataset?",[641],"Mode",[643],"Mean",{"id":645,"data":646,"type":24,"maxContentLevel":27,"version":37,"reviews":650},"e3b566ae-6320-430f-a16a-cb4f51abf7fe",{"type":24,"title":647,"contentRole":37,"markdownContent":648,"audioMediaId":649},"Mean and Median","The mean is what most people call the average. It is the sum of all values divided by the number of observations. For example, add everybody’s height up like — 185 + 175 + 194 — and then divide it by the number of people — 3.\n\n![Graph](image://665aabd5-5920-42bc-a2fe-c02954362734 \"The mean equation\")\n\nThe median is the measurement or value in the exact middle of your data when you order your data from low to high. 
For example, by lining all your friends up from shortest to tallest, and then taking the person in the middle — their height is your median.","cea09611-24cc-44ff-a3ca-1c932ba63f11",[651],{"id":652,"data":653,"type":54,"version":24,"maxContentLevel":27},"520be108-c103-428b-a2c5-b07fb8c9ae05",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":654,"binaryCorrect":656,"binaryIncorrect":658},[655],"How is the median determined?",[657],"By ordering the data from low to high and taking the value in the middle",[659],"By adding all the values and dividing by the number of observations",{"id":661,"data":662,"type":24,"maxContentLevel":27,"version":37},"e5f784a2-e270-450e-9b80-a59b0ce8bd4c",{"type":24,"title":663,"contentRole":37,"markdownContent":664,"audioMediaId":665},"Calculating the Median","There are actually two different formulas for calculating the median depending on whether the number of observations in your dataset is odd or even.\n\nIf odd, then you take the number of observations in your dataset – n –  add one, and then divide that by 2.\n\n![Graph](image://b5b4ec02-8325-4e1f-b899-bd64b576e20a \"The formula for calculating the median\")\n\nIf it is even, then the formula is a little bit more complex. But we have it shown for you below. In reality, in data science, your statistical program will handle all the calculations for you.","00d4d13d-b541-4690-83b3-dc8ea9afb1dd",{"id":667,"data":668,"type":24,"maxContentLevel":27,"version":37,"reviews":671},"c06ab2ff-5417-4edb-b47e-ad5e49baea92",{"type":24,"title":641,"contentRole":37,"markdownContent":669,"audioMediaId":670},"The mode is the most popular value. If 80 percent of customers rate your store as 4 out of 5 stars, then 4 is the mode – because more people chose that value than any other value.\n\nBut it doesn’t have to be a majority. Perhaps you have 100 people, and you asked each to choose their favourite number. 
Let’s say that 95 people all chose a completely unique number, in that nobody else chose the number that they did.\n\nBut 5 people all chose 44. Only 5% of people chose that number, so it’s certainly not a majority. But, more people chose that number than any other number, so it is the plurality.","6c5a18dc-b9f8-4f06-9277-cb8c92921604",[672],{"id":673,"data":674,"type":54,"version":24,"maxContentLevel":27},"ecc808b8-119d-4913-bdc2-96920e106c2c",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":675,"multiChoiceCorrect":677,"multiChoiceIncorrect":679},[676],"If, out of ten people, 4 are French, 2 are Italian, 3 are British, and 1 is Indian, what nationality is the mode?",[678],"French",[680,681,682],"Italian","British","Indian",{"id":684,"data":685,"type":24,"maxContentLevel":27,"version":37,"reviews":689},"8618e35a-da79-4e76-b626-840e6a2e8ec7",{"type":24,"title":686,"contentRole":37,"markdownContent":687,"audioMediaId":688},"Advantages of using the median","The mean is heavily influenced by really large values when it comes to things like income, for example if Elon Musk walks into a small cafe, at his current net worth in 2023, the mean salary for everyone in the cafe would be billions of dollars.\n\nThis means the mean can be unhelpful in dealing with datasets with very large or very small values at the extreme ends.\n\nIn contrast, the median would be much less affected by Elon’s presence. This is why when we report income we tend to use the median, not the mean. If Elon entered the cafe, he would skew the distribution with his huge income, pushing the mean much higher than the median.\n\nOn the other hand, the mean is the most commonly used measure when a distribution is not skewed. When you see an average reported, it is most likely the mean. The mean is also needed for use in some statistical tests, where the median cannot be used. 
And the mean, not the median, is used to calculate standard deviation – a measure of how spread out your data is.\n\nFortunately it is easy to calculate both using statistical software or data science programming languages such as Python. So you’re not limited to using one or the other. It is however important to understand the difference, and when it is best to use the median.","de4fcbcb-810d-4023-aba9-a21aa7e1c086",[690],{"id":691,"data":692,"type":54,"version":24,"maxContentLevel":27},"9858798f-0d5b-41f7-b336-664a2abce5d4",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":693,"multiChoiceCorrect":695,"multiChoiceIncorrect":697},[694],"What is the main advantage of using the median over the mean when dealing with datasets with very large or very small values at the extreme ends?",[696],"The median is less affected by extreme values",[698,699,700],"The median is more accurate","The median is easier to calculate","The median is more commonly used",{"id":702,"data":703,"type":37,"version":24,"maxContentLevel":27,"pages":705},"cabcad3e-fde0-4387-9607-bd45057246e8",{"type":37,"title":704},"Understanding Skewness",[706,731,747],{"id":707,"data":708,"type":24,"maxContentLevel":27,"version":24,"reviews":712},"c67281d9-4b54-4264-a84b-216a32b65013",{"type":24,"title":709,"contentRole":37,"markdownContent":710,"audioMediaId":711},"Skew ","Skewness measures the symmetry of a distribution. For example, the normal distribution – otherwise known as the Gaussian, or Bell Curve – is a symmetrical distribution. This means it has zero skew, or very close to zero. So you are just as likely to find a value 30 points above the mean as you are 30 points below the mean. 
Normal distributions are found everywhere in nature and daily life - birth weight, job satisfaction and IQ all have a normal distribution.\n\n ![Graph](image://a1f27e80-8d70-4f8e-b4da-eb739fd12529 \"Positively skewed distribution\")\n\nBut many distributions are not symmetrical, meaning that they can skew to the left or the right. This can often be seen by the naked eye during data visualization, and there are other easy rules to test whether your distribution is skewed or not.\n\nIn a symmetrical distribution, the mean should be equal to the median – or at least pretty close to it. However, in a non-symmetrical distribution, the two things are likely to be very different.\n\n","cc031f91-2050-40c7-b09b-4d4b380ecd64",[713,724],{"id":714,"data":715,"type":54,"version":24,"maxContentLevel":27},"132ee59a-97c6-4cb0-9805-b4354c15d8c1",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":716,"multiChoiceCorrect":718,"multiChoiceIncorrect":720},[717],"What is a characteristic of a symmetrical distribution?",[719],"The mean is equal to the median",[721,722,723],"The mean is greater than the median","The mean is less than the median","The mean is not related to the median",{"id":725,"data":726,"type":54,"version":24,"maxContentLevel":27},"30dea2ba-388c-40ce-a18f-ca9c933780be",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":727,"clozeWords":729},[728],"In a symmetrical distribution, the mean should be equal to the median.",[730],"equal",{"id":732,"data":733,"type":24,"maxContentLevel":27,"version":24,"reviews":737},"7cedaf47-7a91-45d8-a9bf-c48f91d098d0",{"type":24,"title":734,"contentRole":37,"markdownContent":735,"audioMediaId":736},"The direction of skew","\nHow do you know which direction your dataset is skewed in? A good way to remember, is to look where the longer tail is pointing. If it’s pointing to the left, then your distribution is left (or negatively) skewed. If it’s pointing to the right, then it is right (or positively) skewed. 
\n\n\n ![Graph](image://d85f01bc-c4cd-4533-97f4-9f6a4b5f059e \"The different skews of curves\")\n\nAnother way is to look at the mean and median. If the mean is greater than median, then your distribution is right skewed. If the mean is less than the median, your distribution is left skewed. \n\nAn example of right skewed data in real life is income, because really rich people like Bill Gates and Elon Musk skew the distribution. \n\nOn the other hand, scores on an easy test might be left skewed if most people pass the test with a high score, and only a few people fail it with a score of less than 50%.\n\n","220a8c39-6998-406f-a30d-bcab9675300a",[738],{"id":739,"data":740,"type":54,"version":24,"maxContentLevel":27},"21e61463-0841-4cd5-bf3e-ef682abcc9ea",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":741,"binaryCorrect":743,"binaryIncorrect":745},[742],"What is an example of right skewed data in real life?",[744],"Income",[746],"Test scores",{"id":748,"data":749,"type":24,"maxContentLevel":27,"version":24,"reviews":753},"8e44de56-9015-4eae-ad0a-b7730ccf0e17",{"type":24,"title":750,"contentRole":37,"markdownContent":751,"audioMediaId":752},"Calculating skewness ","\nWhile you can generally visualize when your distribution is skewed, and use simple rules like checking if the mean is greater than the median, or vice versa, to find which direction the skew is in, that still doesn’t help you properly quantify skew. You don’t know how skewed your data really is.\n\nOne of the easiest ways to actually quantify skew – for interval and continuous data only – is Pearson’s median skewness. It’s an easy to understand measure, and you can even calculate it yourself. The equation is as follows:\n\nPearson’s median skewness =  3(Mean - Median)/Standard Deviation\n\nPearson’s median skewness tells you exactly how many standard deviations there are between the median and the mean. 
If the value is really close to 0 – between -0.4 and 0.4 – then you can consider that to be a symmetrical distribution, and not meaningfully skewed. \n\nIf your result is greater than 0 that means your data is positively skewed – right skewed. If it is less than 0 then it’s negatively skewed – left skewed. \n\n","e5cd7ffa-c44a-4049-8b4e-64cae881f899",[754],{"id":755,"data":756,"type":54,"version":24,"maxContentLevel":27},"8df7f91e-3bdc-48ab-bf3c-5eb7f3582041",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":757,"clozeWords":759},[758],"Pearson's median skewness is a way to quantify the skew of your data.",[760,761],"skewness","skew",{"id":763,"data":764,"type":37,"version":24,"maxContentLevel":27,"pages":766},"7425c022-d63e-4e74-a3a5-904cdd2cacee",{"type":37,"title":765},"Variance and Standard Deviation",[767,783,799,813,827,843],{"id":768,"data":769,"type":24,"maxContentLevel":27,"version":24,"reviews":773},"b2634243-bfa4-4793-92b0-8b5d7c747c0d",{"type":24,"title":770,"contentRole":37,"markdownContent":771,"audioMediaId":772},"Sample variance ","\nVariance, as the name might suggest, measures the amount of variation in your data. By that, we mean how far your values are from the mean, on average. Variance shows you how ‘spread out’ your data is. \n\nA dataset with a high variance has a wide range of values, whereas a dataset with a low variance has a narrow range of values. If you take the age of everyone in a primary school class, then the variance will likely be low. However if you take the age of everyone in a company it’ll likely have higher variance. 
\n","48e04acd-3348-4c14-a9e6-09f02e76725c",[774],{"id":775,"data":776,"type":54,"version":24,"maxContentLevel":27},"2e9a3435-3a1d-41a2-8d08-d577dda89979",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":777,"binaryCorrect":779,"binaryIncorrect":781},[778],"How does variance measure the amount of variation in data?",[780],"By showing how far values are from the mean, on average",[782],"By showing how close values are to the mode, on average",{"id":784,"data":785,"type":24,"maxContentLevel":27,"version":24,"reviews":789},"612cb726-8270-4bf6-9dc5-bae9522430ec",{"type":24,"title":786,"contentRole":37,"markdownContent":787,"audioMediaId":788},"Calculating sample variance ","\nTo calculate the variance of a data set, you need to:\n\n1. Calculate the mean of the data set by adding all the values together and dividing by the number of values.\n\n2. Subtract the mean from each value in the data set and square the result.\n\n3. Add up all the squared differences.\n\n4. Divide the sum by the number of values in the data set minus 1.\n\nThis gives you the variance of the data set. The equation looks like this:\n\n\n ![Graph](image://5b0aa1e9-b3bd-44de-8ef3-f8bdec6c3c69 \"The sample variance formula\")\n\n","6db050ec-1add-4582-8e47-aa9001e5bb39",[790],{"id":791,"data":792,"type":54,"version":24,"maxContentLevel":27},"e5b0423d-c5e0-4ca3-bce8-48251c92742b",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":793,"binaryCorrect":795,"binaryIncorrect":797},[794],"What equation is used to calculate the variance of a data set?",[796],"(Σ(x-μ)^2)/(n-1)",[798],"(Σ(x-y)^2)/(n^2+1)",{"id":800,"data":801,"type":24,"maxContentLevel":27,"version":24,"reviews":805},"376d7b52-3235-48e0-becd-02411fd07874",{"type":24,"title":802,"contentRole":37,"markdownContent":803,"audioMediaId":804},"Sample Standard Deviation","\nThe standard deviation is another measure of the spread of a dataset, and relies on knowing the variance of your data. 
Specifically, the standard deviation is how far your values are away from the mean, on average. In short, it measures the dispersion of your data. \n\nThe key difference between standard deviation and variance is that the results of your variance calculation are presented as a squared value, whereas standard deviation is in the same units as your data. Once you understand one, it’s easy to understand the other. \n","c8a7bbe5-96b9-404e-80df-61735d212867",[806],{"id":807,"data":808,"type":54,"version":24,"maxContentLevel":27},"072bdf50-dede-44cb-9b05-c110b7075a7a",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":809,"activeRecallAnswers":811},[810],"How does standard deviation measure the spread of a dataset?",[812],"It tells you how far values are away from the mean, on average",{"id":814,"data":815,"type":24,"maxContentLevel":27,"version":24,"reviews":819},"ce0eb00b-2874-42d4-839f-c7afbc6bc0e8",{"type":24,"title":816,"contentRole":37,"markdownContent":817,"audioMediaId":818},"Calculating Sample Standard Deviation","So, variance is calculated by finding the average squared distance of all your values from the mean. The standard deviation is calculated by taking the square root of variance.  \n\nThe equation for variance looks like this:\n\n ![Graph](image://8abf1377-d008-4b69-addc-9368229cc1cb \"Sample variance\")\n\n\nStandard Deviation is the square root of variance. So the equation looks like this:\n\n ![Graph](image://07661de7-bc49-4bf3-90c2-db6c7bd1b92a \"Standard deviation\")\n\nThe formula for standard deviation is \n\nsqrt(sum((x - mean)^2) / (n - 1))\n\nHopefully you recognize that a similar version of this formula was used to calculate the variance. 
All we do is find the square root of the variance calculation to find the standard deviation.\n\n","eb5d7776-a011-4ce0-8e6a-110f00bb4744",[820],{"id":821,"data":822,"type":54,"version":24,"maxContentLevel":27},"dd611b91-334b-4c24-b568-52951c013dad",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":823,"clozeWords":825},[824],"The formula for standard deviation is sqrt(sum((x - mean)^2) / (n - 1)).",[826],"standard deviation",{"id":828,"data":829,"type":24,"maxContentLevel":27,"version":24,"reviews":833},"310a8433-2645-4cb1-9863-33ac7be5a4c0",{"type":24,"title":830,"contentRole":37,"markdownContent":831,"audioMediaId":832},"The difference between variance and standard deviations ","\nVariance and standard deviation are basically siblings. It’s just that one – standard deviation – is a lot easier to interpret, so you and everyone can understand what it means, it makes preparing and interpreting results much easier. \n\nThe standard deviation is just the square root of variance. If you’re wondering ‘well what is the point of that? If you have one why do you need the other, they both tell you the spread of the distribution?’. \n\nBut, by taking the square root of the variance you get a value – the standard deviation – that is in the same measurement units as your original values – for example minutes, seconds, centimeters, or inches. \n\n ![Graph](image://c55cd3fc-952e-4ed7-8a83-e148a2847975 \"Standard deviations plotted as a curve\")\n\nSo, if you want a measure that speaks your language, you should use the standard deviation. It makes interpretation and reporting much easier! For example, it will enable you to report that the mean height of giraffes in Africa was 5.5 meters, with a standard deviation of 0.5 meters. This gives us a quantifiable number on how the data is distributed. Plus, having everything in the same measurement units just makes things easier. \n\nThat doesn’t mean you don’t need variance, though. 
As an example, it is used in some statistical tests to test whether two samples might come from different populations. \n\n","c60d0c2d-3fb9-4142-afb0-e618a6201bf5",[834],{"id":835,"data":836,"type":54,"version":24,"maxContentLevel":27},"28b04bc8-c154-493d-88b9-42de7db5ea26",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":837,"binaryCorrect":839,"binaryIncorrect":841},[838],"What is the benefit of using standard deviation instead of variance?",[840],"It is in the same measurement units as the original values.",[842],"It is easier to interpret.",{"id":844,"data":845,"type":24,"maxContentLevel":27,"version":24,"reviews":849},"6a4ea781-47da-434b-82d0-ac45cb2a4d65",{"type":24,"title":846,"contentRole":37,"markdownContent":847,"audioMediaId":848},"The empirical rule ","\nThe empirical rule tells you that for a normal distribution, 68% of all values are within 1 standard deviation of the mean, 95% of all values are within 2 standard deviations of the mean, and 99.7% of all values are within 3 standard deviations of the mean. \n\nThe empirical rule is helpful because if you know that your data is normally distributed, and you know your sample mean, and your standard deviation, you can begin making predictions about outcome probability. \n\n ![Graph](image://f15a688b-b72c-4299-ad1b-466d58b740fe \"A herd of zebras\")\n\nFor example, if you have a herd of zebras at the zoo, and they live 20 years on average, with a standard deviation of 5 years, you can begin to understand the probability that a zebra will live beyond a certain age. \n\nDue to the fact that 95% of values fall within two standard deviations of the mean, you can subtract 2 times the standard deviation from the mean – 20 - 10 = 10 – and likewise add two standard deviations to the mean – 20 + 10 = 30 – to discover that, based on your data, it is likely that 95% of Zebras will live between 10 and 30 years. 
\n\nQuestions such as these are popular on statistics exams.\n","e878bfb2-138e-417f-8438-36f26c9f439d",[850],{"id":851,"data":852,"type":54,"version":24,"maxContentLevel":27},"ae6cd919-720b-468c-91a2-8761ae5e6a27",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":853,"clozeWords":855},[854],"The empirical rule is helpful for predicting outcome probability based on a normal distribution's mean and standard deviation.",[856],"outcome probability",{"id":858,"data":859,"type":29,"maxContentLevel":27,"version":24,"orbs":862},"d7fe1682-6380-4d51-806b-8377cac70ad0",{"type":29,"title":860,"tagline":861},"Probability Functions ","Step into the world of probability distributions – learn how real world events are modeled and visualized.",[863,932,981,1039],{"id":864,"data":865,"type":37,"version":24,"maxContentLevel":27,"pages":867},"c373b0c3-1efa-4387-8968-8572b1302699",{"type":37,"title":866},"Understanding Probability Distributions",[868,886,892,917],{"id":869,"data":870,"type":24,"maxContentLevel":27,"version":24,"reviews":874},"6ea8e463-408e-4430-b614-43347db32703",{"type":24,"title":871,"contentRole":37,"markdownContent":872,"audioMediaId":873},"What are probability distributions? ","Probability distributions show you the spread of results from a sample population – the lowest and highest values observed and everything in between – as well as the likelihood of observing a particular value – for example, a person who is 175cm tall.\n\n ![Graph](image://654e8a4c-5953-4f4e-98c6-5d8df4c3f2f5 \"Standard normal distribution\")\n\nThe higher the line is on the y-axis – the vertical line – the more observations with that value on the x-axis – the horizontal line – were counted. So, it shows how frequently that value was observed.\n\nImagine you measured the height of everyone in your town. You would have a lot of people who were average height, and a few who were really tall or really short. A probability distribution allows you to easily visualize all this data. 
\n\n","0017550f-1c4d-4d92-b6bf-1f9fa26abcda",[875],{"id":876,"data":877,"type":54,"version":24,"maxContentLevel":27},"2da6a2ab-7b6e-444c-a57d-927434ddaf26",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":878,"multiChoiceCorrect":880,"multiChoiceIncorrect":882},[879],"What does a probability distribution allow you to do with data?",[881],"Visualize it",[883,884,885],"Calculate it","Analyze it","Summarize it",{"id":887,"data":888,"type":24,"maxContentLevel":27,"version":24},"74f1eb89-3af9-4c4a-b303-d2693599d917",{"type":24,"title":889,"contentRole":37,"markdownContent":890,"audioMediaId":891},"The Central Limit Theorem "," ![Graph](image://1ccdde9c-190f-4187-bb41-b1a66dd7b222 \"The Central Limit Theorem\")\n\nThe central limit theorem states that no matter what the probability distribution of the population your samples are drawn from, the distribution of the sample means will be approximately normal, as long as each sample is large enough. This allows us to make inferences about the population’s mean and standard deviation – and even conduct statistical tests that require the data to be normally distributed. \n\n","27c84791-0f3a-453b-bf79-ae7d0c9d75b1",{"id":893,"data":894,"type":24,"maxContentLevel":27,"version":24,"reviews":898},"95c4b40b-b071-4fa8-99b9-b95f1958199b",{"type":24,"title":895,"contentRole":37,"markdownContent":896,"audioMediaId":897},"How the Central Limit Theorem works"," ![Graph](image://322c9990-4c59-4258-8357-92dbfde5948a \"The Central Limit Theorem\")\n\nIt’s time to picture the Central Limit Theorem in action. Imagine that we have a population of 2000, and we’re going to take a sample from that population. Let’s say we have a sample size of 100 - just 5% of the population - and measure the length of human index fingers. \n\nWe’re going to record the mean of that sample. Then we’re going to take another sample of 100 random observations, and record the mean for that too. 
But, we have to put the first 100 observations we sampled back into the population: that’s called ‘sampling with replacement’. We do this lots of times, at least 30. And we end up with a lot of sample means. \n\nIf we plot these sample means as a distribution, no matter what shape the initial distribution was, we will end up with a normally distributed set of sample means. \n\n","7a76c0fd-4270-4cec-960b-bd37149b7d9d",[899,910],{"id":900,"data":901,"type":54,"version":24,"maxContentLevel":27},"7a57fa1b-6a87-433f-bfe2-70c979444325",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":902,"multiChoiceCorrect":904,"multiChoiceIncorrect":906},[903],"What is the process called when a sample is taken from a population and then put back into the population?",[905],"Sampling with replacement",[907,908,909],"Sampling without replacement","Sampling with exclusion","Sampling with inclusion",{"id":911,"data":912,"type":54,"version":24,"maxContentLevel":27},"d944526c-d996-43f5-acbf-17d9120777e8",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":913,"clozeWords":915},[914],"If we take samples from a population, and plot the sample means as a distribution, it will be normally distributed.",[916],"normally",{"id":918,"data":919,"type":24,"maxContentLevel":27,"version":24,"reviews":923},"5fee7e6d-4abe-4c98-8cee-f35278e5334c",{"type":24,"title":920,"contentRole":37,"markdownContent":921,"audioMediaId":922},"Probability Mass Function ","\nThe Probability Mass Function – PMF – tells you the probability of observing a particular discrete value. \n\nAn example of a discrete value is the number of people in a family: it might be 5, or it might be 6. However, you won’t get a number in between, like 5.54, because that would be a continuous variable, and you can’t have 5.54 people.\n\nUnlike the Cumulative Distribution Function, the Probability Mass Function (PMF) doesn’t tell you the probability of seeing a value that is X or less. 
When it comes to a PMF, if you choose a value on the X-axis – the horizontal line – let’s say you chose 9, and find its corresponding Y-value – on the vertical line – then you have the probability that you will see a family of exactly 9 people out on your walk. \n\nIf you did the same on a Cumulative Distribution Function, you would get the probability that you saw a family of 9 people or less – which is much more likely than seeing a family of exactly 9 people.\n","d77341e6-c1de-4dfa-a521-6390b8646467",[924],{"id":925,"data":926,"type":54,"version":24,"maxContentLevel":27},"d682eb83-68ca-40d1-8b89-e4e9fca705bd",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":927,"clozeWords":929},[928],"The Probability Mass Function tells you the probability of observing a particular discrete value, while the Cumulative Distribution Function tells you the probability of seeing a value that is X or less.",[930,931],"Probability Mass Function","Cumulative Distribution Function",{"id":933,"data":934,"type":37,"version":24,"maxContentLevel":27,"pages":936},"8f724126-e8a9-4224-b628-fc978922ad1b",{"type":37,"title":935},"Central Limit Theorem and Its Applications",[937,953,967],{"id":938,"data":939,"type":24,"maxContentLevel":27,"version":24,"reviews":943},"ef8391ee-949b-47f1-b9a7-cf923ede7bc8",{"type":24,"title":940,"contentRole":37,"markdownContent":941,"audioMediaId":942},"Probability Density Function ","\nThe Probability Density Function – PDF – tells you the likelihood that you will observe a variable with a certain value – like a basketballer with a height of 210cm – within a population – like all basketballers in the USA. \n\n\n ![Graph](image://9197abe3-2ed6-466b-b319-6470ae82368f \"Probability Density Function (left)\")\n\n\nThe population doesn’t have to be every living human. It just needs to be a small sample of your population of interest – although the more observations you have, the better – as long as they are randomly sampled. 
\n\nThe Probability Density Function is used only for continuous variables like a person’s weight – for example, your friend that is 70.54kg. In comparison, an example of a discrete variable is a six-sided die – it can land on 1 or 2, but not 1.5. \n\nSometimes the PDF and the Probability Mass Function ‘PMF’ get mixed up. They’re similar, just used for different types of variables. The PMF is used only to describe discrete probability distributions, the PDF for continuous probability distributions. \n\n\n","6333d7f2-7819-449c-a370-5d05f5df68a1",[944],{"id":945,"data":946,"type":54,"version":24,"maxContentLevel":27},"df928f38-899e-486c-be8e-4feab472e6ab",{"type":54,"reviewType":37,"spacingBehaviour":24,"binaryQuestion":947,"binaryCorrect":949,"binaryIncorrect":951},[948],"What is the difference between the Probability Density Function and the Probability Mass Function?",[950],"The PDF is used for continuous variables, the PMF for discrete variables.",[952],"The PDF is used for discrete variables, the PMF for continuous variables.",{"id":954,"data":955,"type":24,"maxContentLevel":27,"version":24,"reviews":959},"0ea0416e-b115-4c13-8282-19f7f8590ef1",{"type":24,"title":956,"contentRole":37,"markdownContent":957,"audioMediaId":958},"Cumulative Distribution functions ","\nCumulative Distribution Functions – CDF –  don’t tell you the probability of observing a certain value on the X-axis – like a Probability Density Function ‘PDF’. Rather, Cumulative Distribution Functions tell you the probability of observing that value or lower. \n\n ![Graph](image://ab031f26-43d5-4d73-9b55-2130f105e02f \"Cumulative Distribution Function (right)\")\n\nLet’s take rolling dice for example. If you roll a die many times and record each result, and create a CDF from your data, look at the 3 on the X-axis – the horizontal line – of the CDF and its corresponding value on the Y-axis – the vertical line. 
The number on the Y-axis is not the probability you will roll a 3, it’s the probability you will roll a 3 or lower. \n\nCumulative Distribution Functions ‘CDF’ can be used for both Discrete variables – like the numbers on a die that can be 1 or 2 but never 1.5 – and Continuous variables like your friend’s weight, that can be 60kg, 61kg, or anything in between like 60.17kg. \n\n","015ab4d5-32d6-4825-96ff-4f41b7547d0b",[960],{"id":961,"data":962,"type":54,"version":24,"maxContentLevel":27},"63f1d0ae-1d92-4c62-8edc-3ccc651b4c86",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":963,"activeRecallAnswers":965},[964],"What does a Cumulative Distribution Function (CDF) tell you about a value on the X-axis?",[966],"The probability of observing that value or lower",{"id":968,"data":969,"type":24,"maxContentLevel":27,"version":24,"reviews":973},"653d827f-91ba-4890-bf82-eb723623f180",{"type":24,"title":970,"contentRole":37,"markdownContent":971,"audioMediaId":972},"The Normal Distribution ","\nThe normal distribution is often called the ‘bell-shaped’ curve because it looks like, well, a bell. It is also known as the ‘Gaussian’ distribution, after mathematician Carl Friedrich Gauss.\n\n\n ![Graph](image://78bf0f97-e9e8-49af-aa5c-b3ab59ea9459 \"A Gaussian (normal) distribution\")\n\nLet’s think about human height for a moment. It’s pretty rare to see someone only 4ft tall, right? They do exist, you just don’t see a lot of them. \n\nThe same goes for really tall people, like basketballers. \n\nMost people are average, or pretty close to average. \n\nHuman height is something that creates a bell curve when observations are randomly sampled from the population. \n\nYou have the left and right hand side tails. These represent the number of really short people and really tall people respectively. This is due to the fact that there aren’t as many of them. In fact, you probably have as many people who are extremely tall as ones who are extremely short. 
Then, you have a large mass in the center, which represents average people. The further away from the center you go, the rarer it is to find someone with that height. \n\n","67f4bf70-8c15-4a04-9f4e-603bc8a4990d",[974],{"id":975,"data":976,"type":54,"version":24,"maxContentLevel":27},"1251a40f-3711-46df-8439-8cdc297ed865",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":977,"activeRecallAnswers":979},[978],"What is the normal distribution often called?",[980],"Bell-shaped curve",{"id":982,"data":983,"type":37,"version":24,"maxContentLevel":27,"pages":985},"6263e5df-61a7-47ef-9dcd-23361699cbe9",{"type":37,"title":984},"Discrete Probability Distributions",[986,1003,1025],{"id":987,"data":988,"type":24,"maxContentLevel":27,"version":24,"reviews":992},"e33f356d-e9cf-4725-85d6-3bc5e3cf2413",{"type":24,"title":989,"contentRole":37,"markdownContent":990,"audioMediaId":991},"Binomial distribution","\nA binomial distribution is a type of probability distribution that is used to describe the number of successes in a fixed number of trials, where each trial has only two possible outcomes: success or failure. It is named after the word \"binomial\", which means \"two names\" in Latin, referring to the two possible outcomes of each trial.\n\n ![Graph](image://5fc47a2a-9f33-4d70-bfff-5e6ada071322 \"Coin flips are a classic binomial distribution\")\n\nFor example, let's say you flip a coin 10 times. The number of times the coin lands on heads can be described by a binomial distribution. Each flip is a trial, and the possible outcomes are heads or tails. The probability of getting heads on one flip is 0.5, and the probability of getting heads on 10 flips is determined by the binomial distribution.\n\nOne important thing to note is that in a binomial distribution, the trials are independent. This means that the outcome of one trial does not affect the outcome of another trial. 
For example, the fact that a coin landed on heads on the first flip does not affect the probability of it landing on heads on the second flip.\n\nThe binomial distribution is also characterized by two parameters, n and p. n is the number of trials, and p is the probability of success in each trial. Knowing these two parameters, we can calculate the probability of getting a certain number of successes in n trials. This can be useful in many real-life situations, such as in business, medicine, and engineering.\n\n\n","cb503819-8510-4ca6-96ab-08bbdafb0cdc",[993],{"id":994,"data":995,"type":54,"version":24,"maxContentLevel":27},"d686d7d3-cac9-4587-8e29-dc55f9af2d6f",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":996,"multiChoiceCorrect":998,"multiChoiceIncorrect":999},[997],"What is the name of the probability distribution that describes the number of successes in a fixed number of trials, where each trial has only two possible outcomes?",[989],[1000,1001,1002],"Gaussian distribution","Poisson distribution","Exponential distribution",{"id":1004,"data":1005,"type":24,"maxContentLevel":27,"version":24,"reviews":1009},"2e17da7a-48a2-4da2-9475-a34af1d2e34a",{"type":24,"title":1006,"contentRole":37,"markdownContent":1007,"audioMediaId":1008},"Assumptions for the binomial distribution","There are some assumptions for binomial distribution. \n\nFirstly, the population should be fixed – meaning you don’t have new balls sneaking in or out. Secondly, in a binomial distribution, the observations should be independent of each other. \n\nThat’s why we sample with replacement: by removing one ball from the bucket, you would change the sample from which you drew, and that affects the probability of drawing a ball of the same color from the bucket, because there’s one less in there now. 
Instead, you would put the ball back – replacing it – and keep the sample size the same.\n\nFinally, we assume there are only two possible outcomes – only red and blue balls, or only heads or tails on a coin.\n","0929e6bd-bbd6-4878-9c5e-43c1bc30767e",[1010,1017],{"id":1011,"data":1012,"type":54,"version":24,"maxContentLevel":27},"07953980-335e-4992-a5a3-51776543cb3e",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":1013,"activeRecallAnswers":1015},[1014],"What assumption is made about the population in a binomial distribution?",[1016],"It should be fixed",{"id":1018,"data":1019,"type":54,"version":24,"maxContentLevel":27},"0901e994-2271-4a51-861b-9fba0de77584",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":1020,"clozeWords":1022},[1021],"In a binomial distribution, the population should be fixed, the observations should be independent, and there should be only two possible outcomes.",[1023,1024],"fixed","independent",{"id":1026,"data":1027,"type":24,"maxContentLevel":27,"version":24,"reviews":1031},"fe66ce18-0a14-460e-9d11-cd1bbea0b99f",{"type":24,"title":1028,"contentRole":37,"markdownContent":1029,"audioMediaId":1030},"Poisson process and distribution","\n\nA Poisson process is a type of random process used to model the number of events that occur within a certain time interval. It's named after the French mathematician Siméon Denis Poisson. The events can be anything, such as the number of phone calls received by a call center, the number of cars that pass by a certain point on the road, or the number of goals scored by a soccer team.\n\nThe Poisson process has two key features: the average rate at which events occur (lambda), and the fact that the time intervals between events are independent. For example, if a call center receives an average of 5 calls per minute, then the Poisson process would model the number of calls received in any given minute. 
The time between the calls received is independent, meaning the time between the first and second call does not affect the time between the second and third call.\n\nThe Poisson distribution is closely related to the Poisson process. It's a probability distribution that describes the number of events that occur within a certain time interval. It's determined by the lambda parameter, which is the average rate at which events occur. The Poisson distribution can tell us, for example, the probability of a call center receiving 6 calls in a minute, given that the average rate of calls is 5 per minute.\n","482f277b-c8a3-402f-a1a4-30b92ff58dd7",[1032],{"id":1033,"data":1034,"type":54,"version":24,"maxContentLevel":27},"b81586e5-2de5-4ba5-9a83-bbcf2a84640a",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":1035,"multiChoiceCorrect":1037,"multiChoiceIncorrect":1038},[1036],"What is the name of the probability distribution that describes the number of events that occur within a certain time interval?",[1001],[1000,989,1002],{"id":1040,"data":1041,"type":37,"version":24,"maxContentLevel":27,"pages":1043},"47695752-f327-400c-a850-fbac451f6ed8",{"type":37,"title":1042},"Specialized Probability Distributions",[1044,1058,1083],{"id":1045,"data":1046,"type":24,"maxContentLevel":27,"version":24,"reviews":1050},"46ceb25c-6330-4598-a779-8b6b9dfdd9b6",{"type":24,"title":1047,"contentRole":37,"markdownContent":1048,"audioMediaId":1049},"Weibull ","\nThe Weibull analysis is used to model the amount of time it would take for a process or event to occur. Unlike the Poisson process, which models the number of times an event occurs within a given time period, the Weibull process models the time it takes for an event to occur. This distribution shows how we know when spare parts will be needed for your new Toyota – before the existing ones fail. 
\n\n\n ![Graph](image://a66f6457-da86-444e-a588-fbd2c4803f1e \"Curves modelled with Weibull analysis\")\n\nWe have the Weibull to thank for much of our machinery running as smoothly as it does. Without it, we’d really just be guessing when plane parts needed to be replaced. It’s also how technology and other companies have a pretty good idea what kind of warranty they can offer without losing lots of money! \n\nWith the Weibull you can find out how likely it is your machinery will fail at a certain time, the average life of your parts, the rate of failure – how many times you can expect it to fail during a specified timeframe – and how likely it is that your product will still be working at a certain point in time.\n\n","8dc4cd4e-9304-43cb-9199-581fa00d083f",[1051],{"id":1052,"data":1053,"type":54,"version":24,"maxContentLevel":27},"182f4636-fa21-4f91-894d-d4b55b09e332",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":1054,"clozeWords":1056},[1055],"The Weibull analysis is used to model the amount of time it would take for a process or event to occur, and can be used to find the rate of failure.",[1057],"Weibull",{"id":1059,"data":1060,"type":24,"maxContentLevel":27,"version":24,"reviews":1064},"83b4d607-7691-4329-9364-75f1cb3e2026",{"type":24,"title":1061,"contentRole":37,"markdownContent":1062,"audioMediaId":1063},"Bernoulli distribution","\nThe Bernoulli distribution is another method of helping us model the probability of something happening or not happening. The Bernoulli distribution is a discrete probability distribution - meaning it can only predict discrete values. Discrete values are things like categorical variables, or even the numbers on a die. \n\n ![Graph](image://26babf14-6625-4225-94da-bf8fe1a700b7 \"Bernoulli distributions can only have two possible outcomes\")\n\nBut the Bernoulli distribution can only have one trial, and two possible outcomes. As an example, a single coin flip fits this requirement, as it only has two possible outcomes. 
But, a Bernoulli distribution can be anything with two outcomes, like success and failure. For example, if you set your success criteria as rolling a six with a die, and failure as anything other than 6, even though there are 6 possible outcomes, you can frame it as a Bernoulli trial. \n\nIt is important in Bernoulli trials that the two outcomes are mutually exclusive and that each trial is independent – so the two outcomes can’t happen at the same time, and a future outcome can’t be affected by a previous outcome. This is like rolling a die or flipping a coin. \n\nThe Bernoulli distribution is really a calculation that enables you to calculate the probability of each outcome. \n\n","e6b87196-573f-4666-9078-55852b75aafd",[1065,1076],{"id":1066,"data":1067,"type":54,"version":24,"maxContentLevel":27},"2b554cb4-c725-4aaf-b9d0-fd46b0cc7879",{"type":54,"reviewType":27,"spacingBehaviour":24,"multiChoiceQuestion":1068,"multiChoiceCorrect":1070,"multiChoiceIncorrect":1072},[1069],"What is the Bernoulli distribution used to calculate?",[1071],"The probability of each outcome",[1073,1074,1075],"The probability of multiple outcomes","The probability of all outcomes","The probability of one outcome",{"id":1077,"data":1078,"type":54,"version":24,"maxContentLevel":27},"7960d174-246e-4188-89c7-49c0b6f60ed7",{"type":54,"reviewType":24,"spacingBehaviour":24,"activeRecallQuestion":1079,"activeRecallAnswers":1081},[1080],"What type of probability distribution is the Bernoulli distribution?",[1082],"Discrete probability distribution",{"id":1084,"data":1085,"type":24,"maxContentLevel":27,"version":24,"reviews":1089},"0ad7d53c-55b5-439b-9e8b-333fec5f9354",{"type":24,"title":1086,"contentRole":37,"markdownContent":1087,"audioMediaId":1088},"Pareto ","You might have heard of the Pareto Principle before. The Pareto Principle states that 20% of the inputs are responsible for 80% of the results. 
That means 20% of employees do 80% of the work, or 20% of goldmines hold 80% of all the gold, or 20% of the diamonds account for 80% of the diamond mass in the world. \n\nThe distribution comes from Vilfredo Pareto – hence the name – who noticed that wealth distribution in Italy followed this rule. It was 20% of the landowners who owned 80% of the land. \n\n\nPareto distributions appear as heavily skewed to either the right or the left, and have heavy tails – very large outliers, like Elon Musk and his multi-billion dollar net worth for example.\n\nThe majority of us earn a lot less than Elon Musk, and a similar amount to one another, so we are all clustered together on the left-hand side of the distribution. There are lots of people here, so most of the area under the curve is here, too. But Elon Musk sits waaaaay out to the right, with his very large income, creating skew. \n","4e03f27a-db9f-4409-ae5d-3eb7c442296f",[1090],{"id":1091,"data":1092,"type":54,"version":24,"maxContentLevel":27},"88a20e03-3078-46a3-afdb-8c850580ea0f",{"type":54,"reviewType":55,"spacingBehaviour":24,"clozeQuestion":1093,"clozeWords":1095},[1094],"The Pareto Principle states that 20% of inputs are responsible for 80% of the results.",[1096,1097],"20%","80%",{"left":4,"top":4,"width":1099,"height":1099,"rotate":4,"vFlip":6,"hFlip":6,"body":1100},24,"\u003Cpath fill=\"none\" stroke=\"currentColor\" stroke-linecap=\"round\" stroke-linejoin=\"round\" stroke-width=\"2\" d=\"m9 18l6-6l-6-6\"/>",{"left":4,"top":4,"width":1099,"height":1099,"rotate":4,"vFlip":6,"hFlip":6,"body":1102},"\u003Cg fill=\"none\" stroke=\"currentColor\" stroke-linecap=\"round\" stroke-linejoin=\"round\" stroke-width=\"2\">\u003Cpath d=\"M12.586 2.586A2 2 0 0 0 11.172 2H4a2 2 0 0 0-2 2v7.172a2 2 0 0 0 .586 1.414l8.704 8.704a2.426 2.426 0 0 0 3.42 0l6.58-6.58a2.426 2.426 0 0 0 0-3.42z\"/>\u003Ccircle cx=\"7.5\" cy=\"7.5\" r=\".5\" fill=\"currentColor\"/>\u003C/g>",1778179494963]