Skip to content

Instantly share code, notes, and snippets.

@psyguy
Last active May 7, 2020 09:46
Show Gist options
  • Select an option

  • Save psyguy/562b30533d26a813a9ece5cd2de4e1d6 to your computer and use it in GitHub Desktop.

Select an option

Save psyguy/562b30533d26a813a9ece5cd2de4e1d6 to your computer and use it in GitHub Desktop.
بسامد 50 اسم پرتکرار نوزادان ایرانی در سال‌های 1380-1396 (2000-2017) برگرفته از سایت سازمان ثبت احوال کشور
#####----------------------------------------------------------------------------
##
## Reading and cleaning 50 most popular child names in Iran, 2000-2017.
##
## Data taken off Iran's National Organization for Civil Registration website:
## https://www.sabteahval.ir/dtfe/default.aspx?tabid=1383
##
## Acknowledging this contribution (by MH Manuel Haqiqatkhah) is appreciated.
## In case of questions regarding the code, drop me a line on Twitter: @_psyguy.
##
#####----------------------------------------------------------------------------
library(tidyverse)
library(rvest)
library(readr)
## NOTE: no `rm(list = ls())` here. Clearing the global environment is a
## destructive side effect on whoever sources this script, and nothing below
## relies on an empty workspace.

## Directory on sabteahval.ir that hosts the per-year name pages.
name_url_root <- "https://www.sabteahval.ir/Upload/Modules/Contents/asset101/name/"

## File-name suffixes of the 14 pages that are later labelled 1380-1393.
## The 13th suffix is "92" rather than "1392"; it is kept exactly as written,
## presumably because that is how the site names that page (TODO confirm).
suffix_80_93 <- c(1380:1391, 92, 1393)

## Pages prefixed "p" and "d" (one gender per page) for 1380-1393.
p_80_93 <- paste0(name_url_root, "p", suffix_80_93, ".htm")
d_80_93 <- paste0(name_url_root, "d", suffix_80_93, ".htm")

## Pages prefixed "dp" (both genders on one page) for 1394-1396.
a_94_96 <- paste0(name_url_root, "dp", 1394:1396, ".htm")
## Fetch one page and return the first HTML table found on it.
##
## Args:
##   the.url: address of the page; it is passed through iconv(to = "UTF-8")
##            before being handed to read_html().
## Returns:
##   The first <table> of the page as parsed by html_table(fill = TRUE).
## Side effect:
##   Prints the URL once the table has been extracted (acts as a progress log).
dp.read <- function(the.url) {
  page <- read_html(iconv(the.url, to = "UTF-8"))
  first_table <- html_nodes(page, "table")[1]
  parsed <- html_table(first_table, fill = TRUE)[[1]]
  print(the.url)
  parsed
}
## Reshape one single-gender table into long (rank, name, count, gender) form.
##
## The input is treated as two side-by-side panels of three columns each
## (columns 1-3 and columns 4-6). The panels are stacked, only rows whose rank
## is one of 1..50 are kept (this also discards header-like rows), and the
## supplied gender value is attached to every remaining row.
##
## Args:
##   inp.t:  table with at least six columns, laid out as described above.
##   gender: value to store in the `gender` column.
## Returns:
##   Data frame with columns rank, name, count, gender; rank and count are
##   converted with as.numeric().
dp.clean <- function(inp.t, gender) {
  panel_cols <- c("rank", "name", "count")

  left_panel <- inp.t[, 1:3]
  right_panel <- inp.t[, 4:6]
  colnames(left_panel) <- panel_cols
  colnames(right_panel) <- panel_cols

  stacked <- rbind(left_panel, right_panel)
  top50 <- filter(stacked, rank %in% 1:50)
  cleaned <- cbind(top50, gender)

  cleaned$count <- as.numeric(cleaned$count)
  cleaned$rank <- as.numeric(cleaned$rank)
  cleaned
}
## Reshape one combined (both-gender) table into long form.
##
## Rows 3-52 of the input hold the 50 ranked entries. Columns 3-4 are taken as
## the (name, count) pair labelled "d" and columns 5-6 as the pair labelled
## "p". The two pairs are stacked ("d" rows first) and a rank of 1..50 is
## assigned by row position within each pair.
##
## Args:
##   inp.t: table with at least 52 rows and 6 columns, laid out as above.
## Returns:
##   Data frame of 100 rows with columns rank, name, count, gender; rank and
##   count are converted with as.numeric().
a.clean <- function(inp.t) {
  pair_cols <- c("name", "count")

  d_part <- inp.t[3:52, 3:4]
  p_part <- inp.t[3:52, 5:6]
  colnames(d_part) <- pair_cols
  colnames(p_part) <- pair_cols

  labelled <- cbind(
    rbind(d_part, p_part),
    gender = rep(c("d", "p"), each = 50)
  )
  labelled$count <- as.numeric(labelled$count)

  ranked <- cbind(rank = rep(c(1:50), 2), labelled)
  ranked$rank <- as.numeric(ranked$rank)
  ranked
}
## ---- Scrape, tidy and export ------------------------------------------------
## Each list below holds one cleaned table per page and is named by year, so
## that bind_rows(.id = "year") can later turn those names into a `year` column.

## Pages prefixed "p", labelled 1380-1393.
t.p <- p_80_93 %>%
  map(dp.read) %>%
  map(dp.clean, gender = "p")
names(t.p) <- 1380:1393

## Pages prefixed "d", labelled 1380-1393.
t.d <- d_80_93 %>%
  map(dp.read) %>%
  map(dp.clean, gender = "d")
names(t.d) <- 1380:1393

## Combined "dp" pages, labelled 1394-1396.
t.a <- a_94_96 %>%
  map(dp.read) %>%
  map(a.clean)
names(t.a) <- 1394:1396

## Stack the "p" and "d" rows year by year, then append the combined-page years.
t.dp <- map2(t.p, t.d, rbind)
t.tot <- c(t.dp, t.a)
t.tot.df <- bind_rows(t.tot, .id = "year")

## Export as a comma-separated, Excel-friendly CSV. write_excel_csv() is the
## comma-delimited / period-decimal variant; the previously used
## write_excel_csv2() is the semicolon-delimited / comma-decimal variant, and
## overriding only its delimiter with "," made the field separator and the
## decimal mark collide.
write_excel_csv(t.tot.df, "Popular names 2000-2017.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment