Skip to content

Instantly share code, notes, and snippets.

@sripathikrishnan
Last active March 1, 2018 03:42
Show Gist options
  • Select an option

  • Save sripathikrishnan/5b487b4def977e4e4d7e46073a69a149 to your computer and use it in GitHub Desktop.

Select an option

Save sripathikrishnan/5b487b4def977e4e4d7e46073a69a149 to your computer and use it in GitHub Desktop.
Answers to AskUbuntu Data Warehouse Assignment
--Top 10 users by reputation
select id, DisplayName, Reputation
from Users order by Reputation desc limit 10;
--Top 10 questions asked by user Mitch
select p.Title, p.Score, u.DisplayName
from Posts p inner join Users u on p.OwnerUserId = u.Id
and p.PostTypeId = 1
where u.DisplayName = 'Mitch'
order by p.Score desc;
-- Most popular badges, sorted by number of users who have this badge
select b.Name, count(*)
from Badges b inner join UserBadges ub on b.id = ub.BadgeId
group by b.Name
order by 2 desc
limit 10;
-- For users with reputation > 75000, print Userid, DisplayName,
-- Reputation and total number of questions they have asked
select u.Id, u.DisplayName, u.Reputation, count(*) as QuestionsAsked
from Users u inner join Posts p on u.id = p.OwnerUserId and p.PostTypeId = 1
where u.reputation > 75000
group by u.Id, u.DisplayName, u.Reputation
order by QuestionsAsked desc
limit 10;
-- Print userid, user name and reputation for users
-- whose reputation is higher than the user 'Takkat'
select u.Id, u.DisplayName, u.Reputation
from Users u, Users u2
where u2.DisplayName = 'Takkat'
and u.Reputation > u2.Reputation
order by u.Reputation desc;
-- Return a list of users (id, displayname, reputation) that:
-- * Have a reputation greater than 1000
-- * Have received a score of 10 or more on at least one of their comments
-- * Have at least one of the badges "Nice Answer", "Good Answer", "Great Answer"
-- Sort the users by reputation in descending order
with UsersWithAtleastOnePopularComment as (
select u.Id as UserId from Users u
where exists (
select 'x' from comments c1
where c1.UserId = u.Id
and c1.score > 10
)
),
UsersWithBadges as (
select u.Id as UserId from Users u
where exists (
select 'x' from UserBadges ub inner join Badges b on ub.BadgeId = b.Id
where ub.UserId = u.Id and b.Name in ('Nice Answer', 'Good Answer', 'Great Answer')
)
)
select u.Id, u.DisplayName, u.Reputation
from Users u inner join UsersWithAtleastOnePopularComment up on u.Id = up.UserId
inner join UsersWithBadges ub on u.Id = ub.UserId
where u.Reputation > 1000
order by u.Reputation desc;
-- To verify the above query
-- List all comments for a user whose score is > 10
select c.score, c.CommentText from Comments c
where c.score > 10 and c.UserId = 1059;
-- List all badges for user
select ub.UserId, b.Name from Badges b inner join UserBadges ub on b.Id = ub.BadgeId
where ub.UserId = 1059 and b.Name in ('Nice Answer', 'Good Answer', 'Great Answer');
-- Find Duplicate Comments
-- Duplicate is defined as same user posting
-- exact same comment on the same post within 1 minute of each other
select c.Id, c.PostId, u.DisplayName, c.CommentText
from Comments c inner join Users u on c.UserId = u.Id
where exists (
select 'x' from Comments c1
where c1.Id <> c.Id and c1.UserId = c.UserId and c1.PostId = c.PostId
and c1.CommentText = c.CommentText
and extract(epoch from (c1.CreationDate - c.CreationDate)) < 60
);
-- CSV
-- TODO: Not optimum, dislike the distinct
with Answerers as (
select q.Id as QuestionId, array_agg(au.DisplayName) as Answerers
from Posts q inner join Posts a
on q.Id = a.ParentId and q.PostTypeId = 1 and a.PostTypeId = 2
inner join Users au on a.OwnerUserId = au.Id
group by q.Id
),
QuestionCommenters as (
select q.Id as QuestionId, array_agg(distinct cu.DisplayName) as QuestionCommenters
from Posts q inner join Comments c on q.Id = c.PostId and q.PostTypeId = 1
inner join Users cu on c.Userid = cu.id
group by q.Id
),
AnswerCommenters as (
select q.Id as QuestionId, array_agg(distinct u.DisplayName) as AnswerCommenters
from Posts q inner join Posts a
on q.Id = a.ParentId and q.PostTypeId = 1 and a.PostTypeId = 2
inner join Comments c on a.Id = c.PostId
inner join Users u on c.UserId = u.Id
group by q.Id
)
select p.Id, p.Title, a.Answerers, qc.QuestionCommenters, ac.AnswerCommenters
from Posts p inner join Answerers a on p.Id = a.QuestionId
inner join QuestionCommenters qc on p.Id = qc.QuestionId
inner join AnswerCommenters ac on p.Id = ac.QuestionId
where p.PostTypeId = 1 and p.OwnerUserId = 10;
--- Corelated update
update Users
set QuestionsAsked = t.QuestionsAsked
from
(select p.OwnerUserId as UserId, count(*) as QuestionsAsked from Posts p where p.PostTypeId = 1 and p.OwnerUserId is not null group by p.OwnerUserId) t
where id = t.UserId;
-- Alternative way to write the same query
with qa as (
select p.OwnerUserId as UserId, count(*) as QuestionsAsked from Posts p where p.PostTypeId = 1 and p.OwnerUserId is not null group by p.OwnerUserId)
update Users
set QuestionsAsked = qa.QuestionsAsked
from qa
where id = qa.UserId;
-- New Tag Score Algorithm for User
with AnswerTags as (
select a.Id as AnswerId, t.TagName as TagName
from Posts a inner join Posts q
on a.ParentId = q.Id and a.PostTypeId = 2 and q.PostTypeId = 1
inner join PostTags qt on q.id = qt.PostId
inner join Tags t on qt.tagid = t.id
),
AnswerNormalizedScore as (
select a.Id as AnswerId, a.score / count(*) as Score
from Posts a inner join Posts q
on a.ParentId = q.Id and a.PostTypeId = 2 and q.PostTypeId = 1
inner join PostTags qt on q.id = qt.PostId
group by a.Id, a.score
)
select u.DisplayName, at.TagName, sum(norm.Score)
from Posts p inner join Users u on p.OwnerUserId = u.Id and p.PostTypeId = 2
inner join AnswerTags at on at.AnswerId = p.Id
inner join AnswerNormalizedScore norm on norm.AnswerId = p.Id
where u.Id = 10
group by u.DisplayName, at.TagName
order by 3 desc;
-- People who have updated the title of their posts more than 5 times
-- To verify your output, replace postid and then visit
-- https://askubuntu.com/posts/<postid>/revisions
with IndecisiveUsers as (
select ph.PostId as PostId, ph.UserId as UserId, count(*) NumberOfEdits
from PostHistory ph
where ph.PostHistoryTypeId = 4
group by ph.PostId, ph.UserId
having count(*) > 5
)
select p.Id as PostId, p.OwnerUserId, u.DisplayName, p.Title
from Posts p
inner join Users u on p.OwnerUserId = u.Id
inner join IndecisiveUsers iu on p.OwnerUserId = iu.Userid and p.Id = iu.PostId;
-- Indexes
-- TODO: Haven't reviewed the performance of queries wrt these indexes
create index indx_users_reputation on users(reputation);
create index indx_users_displayname on users(displayname);
create index indx_user_badge on userbadges(userid, badgeid);
create index indx_post_owner_user on Posts(owneruserid, posttypeid);
create index indx_comment_userid on Comments(userid);
create index indx_badge_name on Badges(name);
create index indx_comment_user_post on Comments(postid, userid);
create index indx_posthistory on Posthistory(postid);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment