變數名稱 | 中文 | 說明 |
---|---|---|
reporter | 報導者 | |
news | 新聞 | |
reader | 讀者 | |
review | 評分 | 對新聞打分數 |
reviewer | 評分者 | 對新聞打分數的讀者 |
judge | 評價 | 對評分打分數 |
judger | 評價者 | 對評分打分數的讀者 |
score | 分數 | 即為可信度,同weight |
weight | 權重 | 即為可信度,同score |
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sizing parameters for the simulation, looked up by key in the cells below.
num = {
    # Number of reporters
    'reporter' : 8,
    # Number of news items produced by each reporter
    'news_per_reporter' : 40,
    # Number of readers
    'reader' : 20,
    # How many past weights are retained per reader (passed to ReaderWeightHolder)
    'reader_pass_weight_window' : 10
}
def get_readers(n):
    """Build the reader table.

    Args:
        n: number of readers to create.

    Returns:
        A DataFrame with a single ``reader_id`` column holding 0 .. n-1.
    """
    reader_ids = np.arange(n)
    return pd.DataFrame({'reader_id': reader_ids})
# Preview the first rows of the generated reader table.
get_readers(num['reader']).head()
# (IPython magic) Run the project scripts so their definitions are loaded into the session.
# Scale and ReaderWeightHolder used below presumably come from these files - confirm.
%run src/scale.py
%run src/reader_weight_holder.py
# Build a weight holder for all readers and print its current weights.
ReaderWeightHolder(get_readers(num['reader']), Scale().mean, num['reader_pass_weight_window']).print_weights()
%run src/scale.py
%run src/reader_weight_holder.py
# Same construction, this time printing the retained past weights.
ReaderWeightHolder(get_readers(num['reader']), Scale().mean, num['reader_pass_weight_window']).print_past_weights()
# (IPython magic) helper.py presumably provides index_as_id and the draw_* helpers - confirm.
%run src/helper.py
def get_reporters(n, scale):
    """Generate ``n`` reporters with normally distributed credibility.

    Args:
        n: number of reporters to create.
        scale: object whose ``translateZ(z)`` turns a z-score into a score.

    Returns:
        The result of ``index_as_id(frame, 'reporter')`` for a frame with the
        columns ``reporter_score`` and ``reporter_z_scores``.
    """
    # Credibility z-scores drawn from the standard normal distribution.
    z_scores = np.random.randn(n)
    # Pin the first reporter to a very low credibility: row 0 is the reporter
    # that get_target_reporters hands to the unfair-reader experiments.
    # Guarded so that n == 0 yields an empty table instead of an IndexError.
    if n > 0:
        z_scores[0] = -2.5
    # Convert the z-scores into actual scores on the scale.
    scores = [scale.translateZ(z_score) for z_score in z_scores]
    reporters = pd.DataFrame({
        'reporter_score': scores,
        'reporter_z_scores': z_scores
    })
    return index_as_id(reporters, 'reporter')
get_reporters(num['reader'], Scale()).head()
%run src/scale.py
%run src/helper.py
def get_news(per_reporter, scale, reporters):
    """Generate ``per_reporter`` news items for every reporter.

    Args:
        per_reporter: number of news items each reporter produces.
        scale: object whose ``translateZ(z)`` turns a z-score into a score.
        reporters: frame with ``reporter_id`` and ``reporter_z_scores`` columns.

    Returns:
        The result of ``index_as_id(frame, 'news')`` for a frame with the
        columns ``reporter_id``, ``news_score`` and ``news_z_score``.
    """
    # One entry per news item: the reporter columns repeated per_reporter times.
    author_ids = list(reporters['reporter_id']) * per_reporter
    author_z_scores = list(reporters['reporter_z_scores']) * per_reporter
    # Each item's credibility z-score is drawn from N(author z-score, 1):
    # standard-normal noise shifted by the author's own z-score.
    noise = np.random.randn(len(author_ids))
    news_z_scores = noise + author_z_scores
    # True credibility score of every item.
    news_scores = list(map(scale.translateZ, news_z_scores))
    frame = pd.DataFrame({
        'reporter_id' : author_ids,
        'news_score' : news_scores,
        'news_z_score': news_z_scores
    })
    return index_as_id(frame, 'news')
def test_get_news():
    """Smoke helper: create reporters on a fresh Scale and generate their news."""
    scale = Scale()
    sample_reporters = get_reporters(num['reporter'], scale)
    sample_news = get_news(num['news_per_reporter'], scale, sample_reporters)
    return sample_news
# Preview the first rows of a freshly generated news table.
test_get_news().head()
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
def get_reviews(scale, reviewers, news):
    """Have every reviewer review every news item.

    Args:
        scale: object whose ``translateZ(z)`` turns a z-score into a score.
        reviewers: frame with a ``reader_id`` column.
        news: frame with ``news_id``, ``reporter_id``, ``news_score`` and
            ``news_z_score`` columns.

    Returns:
        The result of ``index_as_id(frame, 'review')``; the frame has one row
        per (reviewer, news item) pair plus ``review_z_score`` and
        ``review_score``.
    """
    column_names = ['reviewer_id', 'news_id', 'reporter_id', 'news_score', 'news_z_score']
    news_rows = list(news.itertuples(index = False))
    # Cross product: reviewers on the outside, news items on the inside.
    rows = []
    for reviewer_id in reviewers['reader_id']:
        for item in news_rows:
            rows.append([reviewer_id, item.news_id, item.reporter_id, item.news_score, item.news_z_score])
    reviews = pd.DataFrame(rows, columns = column_names)
    # A review tracks the item's credibility: N(news z-score, 1).
    noise = np.random.randn(len(reviews))
    reviews['review_z_score'] = noise + reviews['news_z_score']
    reviews['review_score'] = list(map(scale.translateZ, reviews['review_z_score']))
    return index_as_id(reviews, 'review')
def test_get_reviews():
    """Smoke helper: build readers, reporters and news, then generate the reviews."""
    scale = Scale()
    sample_readers = get_readers(num['reader'])
    sample_reporters = get_reporters(num['reporter'], scale)
    sample_news = get_news(num['news_per_reporter'], scale, sample_reporters)
    return get_reviews(scale, sample_readers, sample_news)
# Preview the first rows of a freshly generated review table.
test_get_reviews().head()
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
def get_judges(scale, reviewers, reviews, news):
    """Have every reader judge every review.

    Args:
        scale: object providing ``max``, the top of the score range.
        reviewers: frame with a ``reader_id`` column; each id acts as a judger.
        reviews: frame with ``review_id``, ``review_score``, ``news_score`` and
            ``reporter_id`` columns.
        news: not used by this function.

    Returns:
        The result of ``index_as_id(frame, 'judge')``; the frame has one row
        per (review, judger) pair plus ``judge_score``.
    """
    column_names = ['review_id', 'review_score', 'news_score', 'reporter_id', 'judger_id']
    judger_ids = list(reviewers['reader_id'])
    # Cross product: reviews on the outside, judgers on the inside.
    rows = []
    for review in reviews.itertuples(index = False):
        for judger_id in judger_ids:
            rows.append([review.review_id, review.review_score, review.news_score, review.reporter_id, judger_id])
    judges = pd.DataFrame(rows, columns = column_names)

    def closeness(review_score, real_score):
        # The closer a review is to the true score, the higher its judge score.
        return scale.max - abs(review_score - real_score)

    judges['judge_score'] = [
        closeness(review_score, real_score)
        for review_score, real_score in zip(judges['review_score'], judges['news_score'])
    ]
    return index_as_id(judges, 'judge')
def test_get_judges():
    """Smoke helper: build the whole chain up to reviews, then generate the judges."""
    scale = Scale()
    sample_readers = get_readers(num['reader'])
    sample_reporters = get_reporters(num['reporter'], scale)
    sample_news = get_news(num['news_per_reporter'], scale, sample_reporters)
    sample_reviews = get_reviews(scale, sample_readers, sample_news)
    return get_judges(scale, sample_readers, sample_reviews, sample_news)
# Preview the first rows of a freshly generated judge table.
test_get_judges().head()
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def get_data(news, reviews, judges, reader_weight_holder):
    """Merge judges, reviews and news into one flat frame for later calculations.

    Args:
        news: news frame; joined on its index against ``news_id``.
        reviews: review frame; joined on its index against ``review_id``.
        judges: judge frame (one row per review/judger pair).
        reader_weight_holder: object whose ``get(reader_id)`` returns a weight.

    Returns:
        An independent DataFrame with one row per judge, the extra review/news
        columns, a ``judger_weight`` column, and without the rows where a
        reader judges their own review.
    """
    def new_columns(frame, existing):
        # Columns of `frame` that `existing` does not have yet, so a merge
        # never produces duplicated (_x/_y) columns.
        return frame.columns.difference(existing.columns)

    data = judges.merge(reviews[new_columns(reviews, judges)], left_on = 'review_id', right_index = True)
    data = data.merge(news[new_columns(news, data)], left_on = 'news_id', right_index = True)
    data['judger_weight'] = [reader_weight_holder.get(judger_id) for judger_id in data['judger_id']]
    # Drop self-judgements. The explicit copy makes the result a standalone
    # frame, so adding columns to it later does not raise SettingWithCopyWarning.
    return data[data.judger_id != data.reviewer_id].copy()
def test_get_data():
    """Smoke helper: generate every table once and merge them with get_data."""
    scale = Scale()
    sample_readers = get_readers(num['reader'])
    sample_reporters = get_reporters(num['reporter'], scale)
    sample_news = get_news(num['news_per_reporter'], scale, sample_reporters)
    sample_reviews = get_reviews(scale, sample_readers, sample_news)
    sample_judges = get_judges(scale, sample_readers, sample_reviews, sample_news)
    holder = ReaderWeightHolder(sample_readers, scale.mean)
    return get_data(sample_news, sample_reviews, sample_judges, holder)
# Preview the first rows of the merged judge/review/news frame.
test_get_data().head()
def get_reviewer_weights(data):
    """Compute each reviewer's new weight from the judgements they received.

    Args:
        data: frame with ``reviewer_id``, ``judge_score`` and ``judger_weight``
            columns.

    Returns:
        A Series indexed by ``reviewer_id`` holding the average of
        ``judge_score`` weighted by ``judger_weight``.
    """
    def weighted_judge_average(group):
        return np.average(group['judge_score'], weights = group['judger_weight'])

    # Hand only the two columns the average needs to apply(): it keeps the
    # grouping column out of the applied frame (pandas 2.2 deprecates operating
    # on it) and avoids copying every other column for each group.
    grouped = data.groupby('reviewer_id')[['judge_score', 'judger_weight']]
    return grouped.apply(weighted_judge_average)
# Preview the reviewer weights computed from a freshly generated data set.
get_reviewer_weights(test_get_data()).head()
def update_reader_weight(data, reader_weight_holder):
    """Attach every row's reviewer weight to ``data``.

    The ``reviewer_weight`` column is written onto ``data`` itself (handy for
    the later weighted averages) and the same frame is returned.

    Args:
        data: frame with a ``reviewer_id`` column.
        reader_weight_holder: object whose ``get(reader_id)`` returns a weight.
    """
    looked_up = list(map(reader_weight_holder.get, data['reviewer_id']))
    data['reviewer_weight'] = looked_up
    return data
def review_averaging(x):
    """Simple (unweighted) average of the group's review scores."""
    return np.average(x['review_score'])


def review_weight_averaging(x):
    """Average of the group's review scores weighted by ``reviewer_weight``."""
    return np.average(x['review_score'], weights = x.reviewer_weight)


def get_score(data, key, weighted = False):
    """Average the review scores of ``data`` per value of column ``key``.

    Args:
        data: frame with ``review_score`` (and ``reviewer_weight`` when
            ``weighted`` is true) plus the ``key`` column.
        key: name of the column to group by.
        weighted: weight each review by its reviewer's weight when true.

    Returns:
        A Series indexed by ``key`` with the (weighted) average review score.
    """
    if weighted:
        columns, averaging_func = ['review_score', 'reviewer_weight'], review_weight_averaging
    else:
        columns, averaging_func = ['review_score'], review_averaging
    # Select only the columns the average reads, so apply() never operates on
    # the grouping column (deprecated in pandas 2.2) or copies unused columns.
    return data.groupby(key)[columns].apply(averaging_func)
def get_news_score(data, weighted = False):
    """Credibility (weight) of each news item: review scores averaged per ``news_id``."""
    return get_score(data, key = 'news_id', weighted = weighted)
def get_reporter_score(data, weighted = False):
    """Credibility (weight) of each reporter: review scores averaged per ``reporter_id``."""
    return get_score(data, key = 'reporter_id', weighted = weighted)
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate(times, scale, readers, reporters):
    """Run ``times`` rounds with honest readers only, then plot the outcome.

    Every round generates fresh news, reviews and judges and feeds the
    resulting reviewer weights into the reader weight holder.

    Args:
        times: number of rounds.
        scale: Scale instance shared by all generators.
        readers: reader frame.
        reporters: reporter frame.
    """
    # NOTE(review): nesting was reconstructed from a flattened source. The
    # per-round work is assumed to end at inserts(); weighting and plotting are
    # assumed to run once on the final round's data - confirm.
    holder = ReaderWeightHolder(readers, scale.mean)
    for _ in range(times):
        news = get_news(num['news_per_reporter'], scale, reporters)
        reviews = get_reviews(scale, readers, news)
        judges = get_judges(scale, readers, reviews, news)
        data = get_data(news, reviews, judges, holder)
        holder.inserts(get_reviewer_weights(data).to_dict())
    update_reader_weight(data, holder)
    plain_scores = get_reporter_score(data)
    weighted_scores = get_reporter_score(data, weighted = True)
    draw_reporter_scores(scale, data, reporters, plain_scores, weighted_scores)
    draw_user_weights(scale, holder.weights.values())
def run_simulate(times):
    """Create a fresh scale, readers and reporters and run the honest simulation."""
    scale = Scale()
    simulate(
        times = times,
        scale = scale,
        readers = get_readers(num['reader']),
        reporters = get_reporters(num['reporter'], scale),
    )
# Run the honest-reader simulation for 10 rounds.
run_simulate(10)
def get_unfair_readers(readers, percent = 0.2):
    """Randomly pick the unfair readers.

    Samples the fraction ``percent`` of ``readers``, orders the sample by
    ``reader_id``, prints a one-line summary and returns the sampled frame.
    """
    picked = readers.sample(frac = percent)
    picked = picked.sort_values(by = 'reader_id')
    summary = '%d out of %d unfair readers with frac %.2f:' % (len(picked), len(readers), percent)
    print(summary, list(picked['reader_id']))
    return picked
def get_unfair_scores():
    """Return the scores an unfair reader hands out (a new list on every call)."""
    biased_scores = [9, 10]
    return biased_scores
def get_target_reporters(reporters, percent = 0.2):
    """Return the reporters that the unfair readers favour.

    Always the first row of ``reporters`` (as a one-row frame); ``percent``
    is accepted but not used.
    """
    first_row_only = reporters.iloc[:1]
    return first_row_only
def fill_unfair_reviews(reviews, readers, reporters, unfair_readers, unfair_scores, target_reporters):
    """Overwrite the reviews unfair readers give to their target reporters.

    Whenever an unfair reader reviews news written by a target reporter, the
    ``review_score`` is replaced by a random pick from ``unfair_scores``.
    ``reviews`` is modified in place and returned; ``readers`` and
    ``reporters`` are accepted but not used.
    """
    by_unfair_reader = reviews['reviewer_id'].isin(unfair_readers['reader_id'])
    about_target = reviews['reporter_id'].isin(target_reporters['reporter_id'])
    biased = by_unfair_reader & about_target
    # One random unfair score per affected review.
    replacement = np.random.choice(unfair_scores, int(biased.sum()))
    reviews.loc[biased, 'review_score'] = replacement
    return reviews
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate_unfair_review(times, scale, unfair_scores, readers, unfair_readers, reporters, target_reporters):
    """Run ``times`` rounds in which unfair readers give biased reviews, then plot.

    Identical to the honest simulation except that the reviews written by
    ``unfair_readers`` about ``target_reporters`` are overwritten with
    ``unfair_scores`` before the judges are generated.
    """
    # NOTE(review): nesting was reconstructed from a flattened source. The
    # per-round work is assumed to end at inserts(); weighting and plotting are
    # assumed to run once on the final round's data - confirm.
    holder = ReaderWeightHolder(readers, scale.mean)
    for _ in range(times):
        news = get_news(num['news_per_reporter'], scale, reporters)
        honest_reviews = get_reviews(scale, readers, news)
        reviews = fill_unfair_reviews(honest_reviews, readers, reporters, unfair_readers, unfair_scores, target_reporters)
        judges = get_judges(scale, readers, reviews, news)
        data = get_data(news, reviews, judges, holder)
        holder.inserts(get_reviewer_weights(data).to_dict())
    update_reader_weight(data, holder)
    plain_scores = get_reporter_score(data)
    weighted_scores = get_reporter_score(data, weighted = True)
    draw_reporter_scores(scale, data, reporters, plain_scores, weighted_scores)
    draw_user_weights(scale, holder.weights.values())
def run_simulate_unfair_review(times, unfair_reader_percent = 0.4):
    """Set up readers and reporters and run the biased-review simulation.

    Args:
        times: number of rounds.
        unfair_reader_percent: fraction of readers sampled as unfair.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    simulate_unfair_review(
        times = times,
        scale = scale,
        unfair_scores = get_unfair_scores(),
        readers = readers,
        unfair_readers = get_unfair_readers(readers, unfair_reader_percent),
        reporters = reporters,
        target_reporters = get_target_reporters(reporters),
    )
# Run the biased-review simulation for 10 rounds.
run_simulate_unfair_review(10)
def fill_unfair_judges(judges, readers, reporters, unfair_readers, target_reporters, scale):
    """Overwrite the judgements unfair readers give on reviews of target reporters.

    When an unfair reader judges a review of news written by a target
    reporter, a review at or above ``scale.mean`` is praised with an extreme
    high score and a lower one is suppressed with an extreme low score.
    ``judges`` is modified in place and returned; ``readers`` and
    ``reporters`` are accepted but not used.
    """
    by_unfair_reader = judges['judger_id'].isin(unfair_readers['reader_id'])
    about_target = judges['reporter_id'].isin(target_reporters['reporter_id'])
    biased = by_unfair_reader & about_target

    def biased_judge_score(review_score):
        pick = np.random.choice(get_unfair_scores())
        if review_score >= scale.mean:
            # At or above the mean: extreme high score.
            return pick
        # Below the mean: mirror the pick to the low end of the scale.
        return scale.max - pick + 1

    # One random draw per affected judgement, in row order.
    judges.loc[biased, 'judge_score'] = [
        biased_judge_score(review_score)
        for review_score in judges.loc[biased, 'review_score']
    ]
    return judges
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate_unfair_judges(times, scale, readers, unfair_readers, reporters, target_reporters):
    """Run ``times`` rounds in which unfair readers give biased judgements, then plot.

    Reviews stay honest; only the judgements made by ``unfair_readers`` on
    reviews about ``target_reporters`` are overwritten.
    """
    # NOTE(review): nesting was reconstructed from a flattened source. The
    # per-round work is assumed to end at inserts(); weighting and plotting are
    # assumed to run once on the final round's data - confirm.
    holder = ReaderWeightHolder(readers, scale.mean)
    for _ in range(times):
        news = get_news(num['news_per_reporter'], scale, reporters)
        reviews = get_reviews(scale, readers, news)
        honest_judges = get_judges(scale, readers, reviews, news)
        judges = fill_unfair_judges(honest_judges, readers, reporters, unfair_readers, target_reporters, scale)
        data = get_data(news, reviews, judges, holder)
        holder.inserts(get_reviewer_weights(data).to_dict())
    update_reader_weight(data, holder)
    plain_scores = get_reporter_score(data)
    weighted_scores = get_reporter_score(data, weighted = True)
    draw_reporter_scores(scale, data, reporters, plain_scores, weighted_scores)
    draw_user_weights(scale, holder.weights.values())
def run_simulate_unfair_judges(times, unfair_reader_percent = 0.4):
    """Set up readers and reporters and run the biased-judgement simulation.

    Args:
        times: number of rounds.
        unfair_reader_percent: fraction of readers sampled as unfair.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    simulate_unfair_judges(
        times = times,
        scale = scale,
        readers = readers,
        unfair_readers = get_unfair_readers(readers, unfair_reader_percent),
        reporters = reporters,
        target_reporters = get_target_reporters(reporters),
    )
# Run the biased-judgement simulation for 10 rounds.
run_simulate_unfair_judges(10)
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate_unfair_review_and_judge(times, scale, unfair_scores, readers, unfair_readers, reporters, target_reporters, drawing = True):
    """Run ``times`` rounds with both biased reviews and biased judgements.

    Args:
        drawing: plot the reporter scores and reader weights when true.

    Returns:
        ``(data, reporters)``: the merged frame of the final round (including
        ``reviewer_weight``) and the reporters that were passed in.
    """
    # NOTE(review): nesting was reconstructed from a flattened source. The
    # per-round work is assumed to end at inserts(); weighting, plotting and
    # the return are assumed to follow the loop - confirm.
    holder = ReaderWeightHolder(readers, scale.mean)
    for _ in range(times):
        news = get_news(num['news_per_reporter'], scale, reporters)
        honest_reviews = get_reviews(scale, readers, news)
        reviews = fill_unfair_reviews(honest_reviews, readers, reporters, unfair_readers, unfair_scores, target_reporters)
        honest_judges = get_judges(scale, readers, reviews, news)
        judges = fill_unfair_judges(honest_judges, readers, reporters, unfair_readers, target_reporters, scale)
        data = get_data(news, reviews, judges, holder)
        holder.inserts(get_reviewer_weights(data).to_dict())
    update_reader_weight(data, holder)
    if drawing:
        plain_scores = get_reporter_score(data)
        weighted_scores = get_reporter_score(data, weighted = True)
        draw_reporter_scores(scale, data, reporters, plain_scores, weighted_scores)
        draw_user_weights(scale, holder.weights.values())
    return data, reporters
def run_simulate_unfair_review_and_judge(times, unfair_reader_percent = 0.4):
    """Set up readers and reporters and run the biased review-and-judgement simulation.

    Args:
        times: number of rounds.
        unfair_reader_percent: fraction of readers sampled as unfair.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    # The returned pair is not needed here; only the plots matter.
    _data, _reporters = simulate_unfair_review_and_judge(
        times = times,
        scale = scale,
        readers = readers,
        unfair_scores = get_unfair_scores(),
        unfair_readers = get_unfair_readers(readers, unfair_reader_percent),
        reporters = reporters,
        target_reporters = get_target_reporters(reporters),
    )
# Run the biased review-and-judgement simulation for 10 rounds.
run_simulate_unfair_review_and_judge(10)
def remove_fair_reviews(reviews, unfair_readers, target_reporters):
    """Drop the fair reviews written by unfair readers.

    Removes every review whose reviewer is in ``unfair_readers`` and whose
    reporter is NOT in ``target_reporters``; returns a new frame.
    """
    by_unfair_reader = reviews['reviewer_id'].isin(unfair_readers['reader_id'])
    about_target = reviews['reporter_id'].isin(target_reporters['reporter_id'])
    # Unfair reader reviewing a non-target reporter, i.e. an honest review.
    fair_by_unfair = by_unfair_reader & ~about_target
    return reviews.drop(index = reviews.index[fair_by_unfair])
def remove_fair_judges(judges, unfair_readers, target_reporters):
    """Drop the fair judgements made by unfair readers.

    Removes every judgement whose judger is in ``unfair_readers`` and whose
    reporter is NOT in ``target_reporters``; returns a new frame.
    """
    by_unfair_reader = judges['judger_id'].isin(unfair_readers['reader_id'])
    about_target = judges['reporter_id'].isin(target_reporters['reporter_id'])
    # Unfair reader judging a review about a non-target reporter, i.e. an honest judgement.
    fair_by_unfair = by_unfair_reader & ~about_target
    return judges.drop(index = judges.index[fair_by_unfair])
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate_extremely_unfair_review_and_judge(times, scale, unfair_scores, readers, unfair_readers, reporters, target_reporters, drawing = True):
    """Run ``times`` rounds where unfair readers act ONLY on their target reporters.

    Same as the biased review-and-judgement simulation, except that every
    honest review and honest judgement made by an unfair reader is removed,
    leaving only their biased ones.

    Args:
        drawing: plot the reporter scores and reader weights when true.

    Returns:
        ``(data, reporters)``: the merged frame of the final round (including
        ``reviewer_weight``) and the reporters that were passed in.
    """
    # NOTE(review): nesting was reconstructed from a flattened source. The
    # per-round work is assumed to end at inserts(); weighting, plotting and
    # the return are assumed to follow the loop - confirm.
    holder = ReaderWeightHolder(readers, scale.mean)
    for _ in range(times):
        news = get_news(num['news_per_reporter'], scale, reporters)
        honest_reviews = get_reviews(scale, readers, news)
        biased_reviews = fill_unfair_reviews(honest_reviews, readers, reporters, unfair_readers, unfair_scores, target_reporters)
        reviews = remove_fair_reviews(biased_reviews, unfair_readers, target_reporters)
        honest_judges = get_judges(scale, readers, reviews, news)
        biased_judges = fill_unfair_judges(honest_judges, readers, reporters, unfair_readers, target_reporters, scale)
        judges = remove_fair_judges(biased_judges, unfair_readers, target_reporters)
        data = get_data(news, reviews, judges, holder)
        holder.inserts(get_reviewer_weights(data).to_dict())
    update_reader_weight(data, holder)
    if drawing:
        plain_scores = get_reporter_score(data)
        weighted_scores = get_reporter_score(data, weighted = True)
        draw_reporter_scores(scale, data, reporters, plain_scores, weighted_scores)
        draw_user_weights(scale, holder.weights.values())
    return data, reporters
def run_simulate_extremely_unfair_review_and_judge(times, unfair_reader_percent = 0.4):
    """Set up readers and reporters and run the extremely-unfair simulation.

    Args:
        times: number of rounds.
        unfair_reader_percent: fraction of readers sampled as unfair.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    # The returned pair is not needed here; only the plots matter.
    _data, _reporters = simulate_extremely_unfair_review_and_judge(
        times = times,
        scale = scale,
        readers = readers,
        unfair_scores = get_unfair_scores(),
        unfair_readers = get_unfair_readers(readers, unfair_reader_percent),
        reporters = reporters,
        target_reporters = get_target_reporters(reporters),
    )
# Run the extremely-unfair simulation for 10 rounds.
run_simulate_extremely_unfair_review_and_judge(10)
# Tick formatters; PercentFormatter is used for the x axis of the percent plot below.
import matplotlib.ticker as ticker
# (IPython magic) Re-run the project scripts so their definitions are loaded into the session.
%run src/scale.py
%run src/helper.py
%run src/reader_weight_holder.py
def simulate_unfair_reader_percents(times, unfair_reader_percents):
    """Measure reporter 0's score under different fractions of unfair readers.

    For every fraction the extremely-unfair simulation is run (without
    plotting) on the same readers and reporters, and the simple and weighted
    average scores of reporter 0 are recorded.

    Args:
        times: number of rounds per simulation.
        unfair_reader_percents: iterable of fractions of unfair readers.

    Returns:
        ``(results, scale)`` where ``results`` has one row per fraction and the
        columns ``percent``, ``simple_avg``, ``weight_avg``, ``reporter_score``.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 (and copied the frame per row).
    rows = []
    for percent in unfair_reader_percents:
        data, reporters = simulate_extremely_unfair_review_and_judge(
            times = times,
            scale = scale,
            readers = readers,
            unfair_scores = get_unfair_scores(),
            unfair_readers = get_unfair_readers(readers, percent),
            reporters = reporters,
            target_reporters = get_target_reporters(reporters),
            drawing = False,
        )
        # Only the reporter with id 0 is examined.
        data = data[data.reporter_id == 0]
        # Plain average of the review scores.
        reporter_scores = get_reporter_score(data)
        # Average weighted by the reviewers' credibility.
        reporter_scores_weighted = get_reporter_score(data, weighted = True)
        rows.append({
            'percent': percent,
            'simple_avg': reporter_scores[0],
            'weight_avg': reporter_scores_weighted[0],
            'reporter_score': reporters.iloc[0].reporter_score
        })
    results = pd.DataFrame(rows, columns = ['percent', 'simple_avg', 'weight_avg', 'reporter_score'])
    return results, scale
def run_simulate_unfair_reader_percents():
    """Plot reporter 0's simple and weighted scores against the share of unfair readers."""
    unfair_reader_percents = [0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8]
    results, scale = simulate_unfair_reader_percents(10, unfair_reader_percents)
    # The fraction becomes the index, hence the x axis of every line below.
    results = results.set_index('percent')
    plt.plot(results['simple_avg'], label = 'simple avg')
    plt.plot(results['weight_avg'], label = 'weighted avg')
    # Dashed grey reference line. The colour is given once: combining the 'k--'
    # format string with a colour keyword defines it twice and matplotlib warns.
    plt.plot(results['reporter_score'], linestyle = '--', color = '0.55', label = 'origin')
    # Fractions 0..1 are shown as percentages.
    plt.gca().xaxis.set_major_formatter(ticker.PercentFormatter(xmax = 1))
    plt.ylim(scale.min, scale.max)
    plt.xlabel("Percents")
    plt.ylabel("Reporters[0]'s scores")
    plt.title("Different percents of unfair readers")
    plt.legend(loc = 'upper left')
    plt.show()
# Run the unfair-reader-percentage experiment and show its plot.
run_simulate_unfair_reader_percents()
def simulate_unfair_reader_times(times, unfair_reader_percent = 0.4):
    """Measure reporter 0's score for different numbers of simulation rounds.

    For every entry of ``times`` the extremely-unfair simulation is run
    (without plotting) on the same readers and reporters, with a fresh sample
    of unfair readers, and the simple and weighted average scores of
    reporter 0 are recorded.

    Args:
        times: iterable of round counts, one simulation per entry.
        unfair_reader_percent: fraction of readers sampled as unfair.

    Returns:
        ``(results, scale)`` where ``results`` has one row per entry and the
        columns ``time``, ``simple_avg``, ``weight_avg``, ``reporter_score``.
    """
    scale = Scale()
    readers = get_readers(num['reader'])
    reporters = get_reporters(num['reporter'], scale)
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 (and copied the frame per row).
    rows = []
    for time in times:
        data, reporters = simulate_extremely_unfair_review_and_judge(
            times = time,
            scale = scale,
            readers = readers,
            unfair_scores = get_unfair_scores(),
            unfair_readers = get_unfair_readers(readers, unfair_reader_percent),
            reporters = reporters,
            target_reporters = get_target_reporters(reporters),
            drawing = False,
        )
        # Only the reporter with id 0 is examined.
        data = data[data.reporter_id == 0]
        # Plain average of the review scores.
        reporter_scores = get_reporter_score(data)
        # Average weighted by the reviewers' credibility.
        reporter_scores_weighted = get_reporter_score(data, weighted = True)
        rows.append({
            'time': time,
            'simple_avg': reporter_scores[0],
            'weight_avg': reporter_scores_weighted[0],
            'reporter_score': reporters.iloc[0].reporter_score
        })
    results = pd.DataFrame(rows, columns = ['time', 'simple_avg', 'weight_avg', 'reporter_score'])
    return results, scale
def run_simulate_unfair_reader_times():
    """Plot reporter 0's simple and weighted scores against the number of rounds."""
    times = [5, 10, 20, 30, 50]
    results, scale = simulate_unfair_reader_times(times)
    # The round count becomes the index, hence the x axis of every line below.
    results = results.set_index('time')
    plt.plot(results['simple_avg'], label = 'simple avg')
    plt.plot(results['weight_avg'], label = 'weighted avg')
    # Dashed grey reference line. The colour is given once: combining the 'k--'
    # format string with a colour keyword defines it twice and matplotlib warns.
    plt.plot(results['reporter_score'], linestyle = '--', color = '0.55', label = 'origin')
    plt.ylabel("Reporters[0]'s scores")
    plt.title("Different times of simulates")
    plt.ylim(scale.min, scale.max)
    plt.legend(loc = 'upper left')
    plt.show()
# Run the number-of-rounds experiment and show its plot.
run_simulate_unfair_reader_times()