码迷,mamicode.com
首页 > 其他好文 > 详细

NIPS2020 论文下载 代码

时间:2021-01-12 10:46:54      阅读:0      评论:0      收藏:0      [点我收藏+]

标签:search   sorted   name   head   content   paper   dex   list()   cal   

  1 # %% NIPS 2020 论文信息下载
  2 import json
  3 import os
  4 import re
  5 
  6 import pandas as pd
  7 import requests
  8 import tqdm
  9 from bs4 import BeautifulSoup
 10 
 11 
 12 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 13 
 14 # %%
 15 PAPER_HASH_PATTERN = re.compile(rposter_(?P<UID>\w+)\.html)
 16 SESSION_PATTERN = re.compile(rOrals & Spotlights Track \d+:\s*(?P<session>[^;]*))
 17 
 18 
 19 def cleanup_string(s):
 20     s = s.strip()
 21     while    in s:
 22         s = s.replace(  ,  )
 23     return s
 24 
 25 
 26 def download_file(download_url, file_name=None):
 27     if file_name is None:
 28         file_name = os.path.basename(download_url)
 29     response = requests.get(download_url, stream=True)
 30     total = int(response.headers.get(Content-Length))
 31     pbar = None
 32     if total is not None:
 33         pbar = tqdm.tqdm(desc=fDownloading from {download_url} to {file_name},
 34                          total=total, unit=B, unit_scale=True, unit_divisor=1000)
 35     with open(file_name, wb) as file:
 36         for chunk in response.iter_content(chunk_size=10240):
 37             if chunk:
 38                 file.write(chunk)
 39             if pbar is not None:
 40                 pbar.update(len(chunk))
 41 
 42 
 43 # %%
 44 # download paper list
 45 if not os.path.exists(papers.json):
 46     download_file(https://neurips.cc/virtual/2020/public/papers.json, file_name=papers.json)
 47 
 48 # %%
 49 # get oral paper list
 50 oral_papers = set()
 51 response = requests.get(https://neurips.cc/virtual/2020/public/f_orals.html)
 52 soup = BeautifulSoup(response.text, html.parser)
 53 for tag in soup.find_all(a, href=PAPER_HASH_PATTERN):
 54     href = tag[href]
 55     UID = PAPER_HASH_PATTERN.search(href).group(UID)
 56     oral_papers.add(UID)
 57 
 58 # %%
 59 # process paper list
 60 with open(papers.json, mode=r) as file:
 61     data = json.load(file)
 62 
 63 df = pd.DataFrame(columns=[ID, Category, Title, Authors, Keywords, Sessions, URL, Proceedings URL, PDF URL, UID])
 64 for i, paper in enumerate(tqdm.tqdm(data)):
 65     if paper[eventtype] != Poster:
 66         continue
 67 
 68     UID = paper[UID]
 69     category = Poster
 70     sessions = ; .join(paper[sessions])
 71     sessions = ; .join([match.group(session) for match in SESSION_PATTERN.finditer(sessions)])
 72     sessions = cleanup_string(sessions)
 73     if sessions != ‘‘:
 74         category = Spotlight
 75     if UID in oral_papers:
 76         category = Oral
 77 
 78     keywords = set()
 79     for keyword in (; .join(paper[keywords])).split(; ):
 80         keyword = cleanup_string(keyword)
 81         if keyword != ‘‘:
 82             keywords.add(keyword)
 83     keywords = \n.join(sorted(keywords))
 84 
 85     paper = {
 86         ID: paper[id],
 87         Category: category,
 88         Title: cleanup_string(paper[title]),
 89         Authors: cleanup_string(, .join(paper[authors])),
 90         Keywords: keywords,
 91         Sessions: sessions,
 92         URL: fhttps://neurips.cc/virtual/2020/public/poster_{UID}.html,
 93         Proceedings URL: paper[paper_pdf_url],
 94         PDF URL: fhttps://proceedings.neurips.cc/paper/2020/file/{UID}-Paper.pdf,
 95         UID: UID
 96     }
 97     df.loc[len(df)] = paper
 98 
 99 df[Category] = pd.Categorical(df[Category], categories=[Oral, Spotlight, Poster])
100 df.sort_values(by=[Category, Sessions, Keywords], inplace=True)
101 df.to_csv(paper_list.csv, index=False)
102 
# %%
# get paper details
# Gather every distinct keyword (one per line in the Keywords column).
all_subject_areas = set()
for keywords in tqdm.tqdm(df['Keywords'], total=len(df)):
    all_subject_areas.update(area for area in keywords.split('\n') if area != '')

# One indicator column per subject area, marked 'Y' where the paper has that
# keyword and left empty otherwise. The frame shares df's index and cells are
# addressed by index label, because df was sorted earlier and its labels no
# longer match row positions. dtype=object lets the columns hold strings.
indicators = pd.DataFrame(index=df.index, columns=sorted(all_subject_areas), dtype=object)
for label, keywords in df['Keywords'].items():
    for area in keywords.split('\n'):
        if area != '':
            indicators.at[label, area] = 'Y'

df = pd.concat([df, indicators], axis=1)
df.to_csv('NeuraIPS Papers.csv', index=False)

 

NIPS2020 论文下载 代码

标签:search   sorted   name   head   content   paper   dex   list()   cal   

原文地址:https://www.cnblogs.com/imoon22/p/14255581.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!