2016年8月16日火曜日

開発環境

Pythonからはじめる数学入門 (Amit Saha (著)、黒川 利明 (翻訳)、オライリージャパン)の3章(データを統計量で記述する)、3.9(プログラミングチャレンジ)、問題3-3(他のCSVデータでの実験)に取り組んでみる。

問題3-3(他のCSVデータでの実験)

コード(Emacs)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import matplotlib.pyplot as plt
from collections import Counter


def calculate_mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    The result is the sum of the values divided by how many there are.
    An empty sequence raises ``ZeroDivisionError``.
    """
    return sum(numbers) / len(numbers)


def calculate_median(numbers):
    """Return the median of ``numbers``.

    The values are sorted into a new list, so the argument itself is not
    modified. For an odd count the middle value is returned unchanged; for
    an even count the two middle values are added and divided by 2.
    """
    count = len(numbers)
    ordered = sorted(numbers)
    upper = count // 2

    if count % 2:
        # Odd count: a single middle element exists.
        return ordered[upper]

    # Even count: average the two elements straddling the middle.
    lower = upper - 1
    return (ordered[lower] + ordered[upper]) / 2


def calculate_mode(numbers):
    """Return every value of ``numbers`` that occurs most often.

    Args:
        numbers: An iterable of hashable values.

    Returns:
        A list of all values sharing the highest occurrence count, so ties
        produce more than one entry. When every value is distinct, all of
        them are returned. An empty input yields an empty list.
    """
    counts = Counter(numbers)
    if not counts:
        # No data means no mode; avoid indexing into an empty ranking.
        return []

    max_count = max(counts.values())
    return [num for num, count in counts.items() if count == max_count]


def find_differences(numbers):
    """Return each value's deviation from the mean of ``numbers``.

    The result is a new list, in the same order as the input, holding
    ``value - mean`` for every value.
    """
    center = calculate_mean(numbers)
    return [value - center for value in numbers]


def calculate_variance(numbers):
    """Return the variance of ``numbers``.

    This is the sum of the squared deviations from the mean divided by the
    number of values (the divisor is ``len(numbers)``, not ``len - 1``).
    """
    deviations = find_differences(numbers)
    return sum(deviation ** 2 for deviation in deviations) / len(numbers)


def calculate_standard_deviation(numbers):
    """Return the standard deviation of ``numbers``.

    Computed as the square root of ``calculate_variance(numbers)``.
    """
    return calculate_variance(numbers) ** 0.5


def draw_graph_diff(years, diffs):
    """Plot ``diffs`` against ``years`` and print summary statistics.

    A new figure is opened and ``diffs`` is plotted against ``years``, with
    horizontal lines overlaid at the mean and the median. The mean, median,
    mode(s), variance and standard deviation of ``diffs`` are then printed
    to standard output, one per line. This function does not call
    ``plt.show()``.

    Args:
        years: X-axis values, one per entry of ``diffs``.
        diffs: The numeric series to plot and summarize.
    """
    plt.figure()
    # Use the ``diffs`` argument rather than a module-level variable so the
    # function works for whatever series the caller passes in.
    plt.plot(years, diffs)
    plt.ylabel('差異')

    mean = calculate_mean(diffs)
    median = calculate_median(diffs)
    mode = calculate_mode(diffs)
    variance = calculate_variance(diffs)
    std_deviation = calculate_standard_deviation(diffs)

    # Horizontal reference lines: one constant series per statistic.
    for level in (mean, median):
        plt.plot(years, [level for _ in years])
    # The legend is set once, after all three lines exist; its entries
    # follow the plotting order above.
    plt.legend(['差異', '平均値', '中央値'])

    pairs = [('平均', mean), ('中央値', median),
             ('最頻値', ', '.join(map(str, mode))),
             ('分散', variance), ('標準偏差', std_deviation)]
    for label, value in pairs:
        print('{0}: {1}'.format(label, value))


def draw_graph_population(x, y):
    """Plot ``y`` against ``x`` in a new figure labelled as population data.

    The figure gets the title 'population' and the axis labels 'year'
    (x) and 'population' (y). This function does not call ``plt.show()``.
    """
    plt.figure()
    plt.plot(x, y)

    # Axis labels first, then the title; the three calls are independent.
    plt.xlabel('year')
    plt.ylabel('population')
    plt.title('population')


def read_csv(filename):
    """Read a two-column CSV file and return its rows in reversed order.

    The first row is treated as a header and skipped. For every other
    non-empty row, the first column is split on '-' and the part before the
    first '-' is kept as the year (as a string); the second column is
    converted to ``float``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(years, population)`` tuple of two lists of equal length, both
        in the reverse of the file's row order.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
        ValueError: If a second-column value cannot be parsed as a float.
    """
    years = []
    population = []
    # newline='' lets the csv module do its own line-ending handling, as
    # the csv documentation requires for file objects.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            years.append(row[0].split('-')[0])
            population.append(float(row[1]))

    years.reverse()
    population.reverse()

    return years, population


if __name__ == '__main__':
    # Script entry point: plot the population series, then plot and
    # summarize its year-over-year changes.
    filename = 'WORLDBANK-USA_SP_POP_TOTL.csv'
    years, population = read_csv(filename)
    draw_graph_population(years, population)

    # Each change is one value minus the value before it, so there is one
    # fewer change than there are years; drop the first year to line up.
    diff = [current - previous
            for previous, current in zip(population, population[1:])]
    years = years[1:]
    draw_graph_diff(years, diff)

    plt.show()

入出力結果(Terminal, IPython)

$ ./sample3.py
平均: 2559001.037037037
中央値: 2442000.0
最頻値: 2704000.0, 2128000.0, 2711301.0, 3358000.0, 2346000.0, 2375000.0, 2697365.0, 2847000.0, 2575528.0, 3405000.0, 2804000.0, 2062000.0, 2204000.0, 3116000.0, 3263000.0, 2862759.0, 1945000.0, 2210000.0, 2385453.0, 2390446.0, 2374575.0, 2414000.0, 3197000.0, 2482740.0, 2099000.0, 2257000.0, 2320000.0, 2677563.0, 3533000.0, 2152000.0, 3020000.0, 2241000.0, 2013000.0, 3152000.0, 1994000.0, 3207000.0, 1971000.0, 2119000.0, 2851295.0, 2156000.0, 2170000.0, 2359525.0, 2863313.0, 2209000.0, 2609000.0, 3122411.0, 2656238.0, 2470000.0, 2033000.0, 2647000.0, 2198000.0, 2235000.0, 3186000.0, 2806544.0
分散: 181810885149.48007
標準偏差: 426392.8765229082
$

0 コメント:

コメントを投稿