跳转至

My Blog

Pixy 计算种群内和种群间的核苷酸多样性工具

pixy用于从vcf变异文件中计算pi（π）,Dxy和Fst的值。

1.使用conda安装pixy github

conda install -c conda-forge pixy
conda install -c bioconda htslib

2.用法示例官方文档

一键计算pi,fst,dxy

pixy --stats pi fst dxy \
--vcf data/vcf/ag1000/chrX_36Ag_allsites.vcf.gz \
--populations Ag1000_sampleIDs_popfile.txt \
--window_size 10000 \
--n_cores 8

通过指定bed来计算

pixy --stats pi fst dxy \
--vcf data/vcf/ag1000/chrX_36Ag_allsites.vcf.gz \
--populations Ag1000_sampleIDs_popfile.txt \
--bed_file genomic_windows.bed

参数 --chromosomes可以指定染色体，有多条染色体使用逗号分割--chromosomes Chr1,Chr3,Chr10 --population Ag1000_sampleIDs_popfile.txt 文件的格式如下：

ERS223827   BFS
ERS223759   BFS
ERS223750   BFS
ERS223967   AFS
ERS223970   AFS
ERS223924   AFS
ERS224300   AFS
ERS224168   KES
ERS224314   KES

第一列是snp中的样本的名称，第二列是分组信息，中间用制表符分割。 3.可视化使用ggplot2

pixy_to_long <- function(pixy_files){

  pixy_df <- list()

  for(i in 1:length(pixy_files)){

    stat_file_type <- gsub(".*_|.txt", "", pixy_files[i])

    if(stat_file_type == "pi"){

      df <- read_delim(pixy_files[i], delim = "\t")
      df <- df %>%
        gather(-pop, -window_pos_1, -window_pos_2, -chromosome,
               key = "statistic", value = "value") %>%
        rename(pop1 = pop) %>%
        mutate(pop2 = NA)

      pixy_df[[i]] <- df


    } else{

      df <- read_delim(pixy_files[i], delim = "\t")
      df <- df %>%
        gather(-pop1, -pop2, -window_pos_1, -window_pos_2, -chromosome,
               key = "statistic", value = "value")
      pixy_df[[i]] <- df

    }

  }

  bind_rows(pixy_df) %>%
    arrange(pop1, pop2, chromosome, window_pos_1, statistic)

}

绘制在所有染色体级别的分布图

# create a custom labeller for special characters in pi/dxy/fst
pixy_labeller <- as_labeller(c(avg_pi = "pi",
                             avg_dxy = "D[XY]",
                             avg_wc_fst = "F[ST]"),
                             default = label_parsed)

# plotting summary statistics across all chromosomes
pixy_df %>%
  mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even",
                                 chromosome == "X" ~ "even",
                                 TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = c(1:22, "X", "Y"))) %>%
  filter(statistic %in% c("avg_pi", "avg_dxy", "avg_wc_fst")) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = chrom_color_group))+
  geom_point(size = 0.5, alpha = 0.5, stroke = 0)+
  facet_grid(statistic ~ chromosome,
             scales = "free_y", switch = "x", space = "free_x",
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
  xlab("Chromsome")+
  ylab("Statistic Value")+
  scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(0.1, "cm"),
        strip.background = element_blank(),
        strip.placement = "outside",
        legend.position ="none")+
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))

回到页面顶部