5 changed files with 1949 additions and 0 deletions
@ -0,0 +1,5 @@
@@ -0,0 +1,5 @@
|
||||
.idea |
||||
.ipynb_checkpoints |
||||
__pycache__ |
||||
*.pyc |
||||
ghostdriver.log |
@ -0,0 +1,211 @@
@@ -0,0 +1,211 @@
|
||||
{ |
||||
"cells": [ |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 1, |
||||
"metadata": {}, |
||||
"outputs": [], |
||||
"source": [ |
||||
"import selenium\n", |
||||
"import time\n", |
||||
"import datetime\n", |
||||
"from selenium import webdriver\n", |
||||
"from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n", |
||||
"from bs4 import BeautifulSoup\n", |
||||
"\n", |
||||
"from IPython.display import display, Image, HTML\n", |
||||
"\n", |
||||
"from jupyter_progressbar import ProgressBar\n", |
||||
"import json" |
||||
] |
||||
}, |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 3, |
||||
"metadata": {}, |
||||
"outputs": [], |
||||
"source": [ |
||||
"def remove_kickstarter_url_prefix(url):\n", |
||||
" if url.startswith('https://www.kickstarter.com/'):\n", |
||||
" return url[len('https://www.kickstarter.com'):]\n", |
||||
" return url" |
||||
] |
||||
}, |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 7, |
||||
"metadata": {}, |
||||
"outputs": [], |
||||
"source": [ |
||||
"driver = webdriver.Chrome()\n", |
||||
"\n", |
||||
"root = 'https://www.kickstarter.com/'\n", |
||||
"driver.get(root)\n", |
||||
"\n", |
||||
"discover_links = {\n", |
||||
" link\n", |
||||
" for link in driver.find_elements_by_tag_name('a')\n", |
||||
" for link in [link.get_attribute('href')]\n", |
||||
" for link in [remove_kickstarter_url_prefix(link)]\n", |
||||
" if link.startswith(\"/discover/\")\n", |
||||
"}\n", |
||||
"\n", |
||||
"driver.close()\n", |
||||
"driver.quit()" |
||||
] |
||||
}, |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 26, |
||||
"metadata": {}, |
||||
"outputs": [ |
||||
{ |
||||
"name": "stdout", |
||||
"output_type": "stream", |
||||
"text": [ |
||||
"Request-sent\n" |
||||
] |
||||
} |
||||
], |
||||
"source": [ |
||||
"try:\n", |
||||
" driver.close()\n", |
||||
" driver.quit()\n", |
||||
"except Exception as e:\n", |
||||
" print(e)\n", |
||||
"\n", |
||||
"driver = webdriver.Chrome()" |
||||
] |
||||
}, |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 27, |
||||
"metadata": { |
||||
"scrolled": false |
||||
}, |
||||
"outputs": [ |
||||
{ |
||||
"data": { |
||||
"application/vnd.jupyter.widget-view+json": { |
||||
"model_id": "990f33c96fef49aebb4caaf7df72e20f", |
||||
"version_major": 2, |
||||
"version_minor": 0 |
||||
}, |
||||
"text/html": [ |
||||
"<p>Failed to display Jupyter Widget of type <code>VBox</code>.</p>\n", |
||||
"<p>\n", |
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", |
||||
" that the widgets JavaScript is still loading. If this message persists, it\n", |
||||
" likely means that the widgets JavaScript library is either not installed or\n", |
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n", |
||||
" Widgets Documentation</a> for setup instructions.\n", |
||||
"</p>\n", |
||||
"<p>\n", |
||||
" If you're reading this message in another frontend (for example, a static\n", |
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n", |
||||
" it may mean that your frontend doesn't currently support widgets.\n", |
||||
"</p>\n" |
||||
], |
||||
"text/plain": [ |
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0%'))), HTML(value='<b>0</b>% or <b>0</b> of <b>0</b> done', placeholder='0%')))" |
||||
] |
||||
}, |
||||
"metadata": {}, |
||||
"output_type": "display_data" |
||||
}, |
||||
{ |
||||
"ename": "KeyboardInterrupt", |
||||
"evalue": "", |
||||
"output_type": "error", |
||||
"traceback": [ |
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", |
||||
"\u001b[0;32m<ipython-input-27-9f1242042551>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdiscover_link\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdiscover_links\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdiscover_link\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mproject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mProgressBar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_all_projects\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0mprojects\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
||||
"\u001b[0;32m~/.virtualenvs/kickstarter/lib/python3.5/site-packages/jupyter_progressbar/__init__.py\u001b[0m in \u001b[0;36mProgressBar\u001b[0;34m(iter, size)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mtsq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0msize\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msize\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
||||
"\u001b[0;32m<ipython-input-27-9f1242042551>\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mn_wait\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " |
||||
] |
||||
} |
||||
], |
||||
"source": [ |
||||
"projects = dict()\n", |
||||
"\n", |
||||
"class get_all_projects:\n", |
||||
" def __init__(self, driver):\n", |
||||
" self.driver = driver\n", |
||||
" self.total_comments = next(\n", |
||||
" int(element.text.replace(' projects', '').replace(',', ''))\n", |
||||
" for element in driver.find_elements_by_class_name('count')\n", |
||||
" if element.text.endswith(' projects')\n", |
||||
" )\n", |
||||
" \n", |
||||
" def __iter__(self):\n", |
||||
" done = set()\n", |
||||
" driver.execute_script(\"$('.load_more > a').click()\")\n", |
||||
" n_wait = 0\n", |
||||
" \n", |
||||
" while driver.execute_script(\"return $('.load_more > a').length\") > 0:\n", |
||||
" n_wait += 1\n", |
||||
" n_projects = driver.execute_script(\"return $('*[data-project]').length\")\n", |
||||
" if n_projects > 0 or n_wait > 5:\n", |
||||
" driver.execute_script(\"$('.load_more > a').click()\")\n", |
||||
" \n", |
||||
" for item in driver.find_elements_by_css_selector('*[data-project]'):\n", |
||||
" project = json.loads(item.get_attribute('data-project'))\n", |
||||
" if project['id'] not in done:\n", |
||||
" done.add(project['id'])\n", |
||||
" driver.execute_script('$(\"*[data-project_pid=%d]\").parent().remove()' % project['id'])\n", |
||||
" yield project\n", |
||||
" n_wait = 0\n", |
||||
" time.sleep(0.5)\n", |
||||
" \n", |
||||
" def __len__(self):\n", |
||||
" return self.total_comments\n", |
||||
" \n", |
||||
"for discover_link in discover_links:\n", |
||||
" driver.get(root + discover_link)\n", |
||||
" for project in ProgressBar(get_all_projects(driver)):\n", |
||||
" projects[project['id']] = project\n", |
||||
" break" |
||||
] |
||||
}, |
||||
{ |
||||
"cell_type": "code", |
||||
"execution_count": 121, |
||||
"metadata": {}, |
||||
"outputs": [ |
||||
{ |
||||
"name": "stdout", |
||||
"output_type": "stream", |
||||
"text": [ |
||||
"https://www.kickstarter.com//discover/newest?ref=discovery_overlay\n" |
||||
] |
||||
} |
||||
], |
||||
"source": [ |
||||
"print(root + discover_link)\n" |
||||
] |
||||
} |
||||
], |
||||
"metadata": { |
||||
"kernelspec": { |
||||
"display_name": "Python 3", |
||||
"language": "python", |
||||
"name": "python3" |
||||
}, |
||||
"language_info": { |
||||
"codemirror_mode": { |
||||
"name": "ipython", |
||||
"version": 3 |
||||
}, |
||||
"file_extension": ".py", |
||||
"mimetype": "text/x-python", |
||||
"name": "python", |
||||
"nbconvert_exporter": "python", |
||||
"pygments_lexer": "ipython3", |
||||
"version": "3.5.2" |
||||
} |
||||
}, |
||||
"nbformat": 4, |
||||
"nbformat_minor": 2 |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,98 @@
@@ -0,0 +1,98 @@
|
||||
import base64 |
||||
|
||||
import scrapy |
||||
from scrapy_splash import SplashRequest |
||||
|
||||
|
||||
class ExploreSpider(scrapy.Spider):
    """Crawl Kickstarter's discover pages through a Splash renderer.

    Request flow:

    1. ``start_requests`` renders every URL in ``start_urls`` with Splash.
    2. ``parse_explore`` looks for ``/discover/`` links on that page and
       renders one with ``DISCOVER_LUA`` (which clicks "load more").
    3. ``parse_discover`` prints the counters returned by the script and,
       only when ``follow_projects`` is true, requests every ``/projects/``
       link through ``PROJECT_LUA`` (which expands older comments).
    4. ``parse_project`` prints the URL of each rendered project page.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Site origin without a trailing slash; absolute hrefs are reduced to
    # site-relative paths by stripping this prefix.
    BASE_URL = 'https://www.kickstarter.com'

    # Project pages are not followed by default: parse_discover stops after
    # printing its diagnostics. Set to True to enable the project crawl.
    follow_projects = False

    # Lua script for discover pages: load the page, click "load more" twice
    # with fixed waits, then report how many load-more links / project
    # elements are present together with the rendered HTML.
    #
    # A polling variant was drafted but is disabled (it needs
    # `local n_comments = -1` declared before the loop):
    #
    #   while splash:evaljs("$('.load_more > a:visible').length") > 0 do
    #       if (splash:evaljs("$('*[data-pid]').length") ~= n_comments) then
    #           splash:runjs("$('.load_more > a').click()")
    #       end
    #       n_comments = splash:evaljs("$('*[data-pid]').length")
    #       assert(splash:wait(0.5))
    #       break
    #   end
    DISCOVER_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))

        splash:runjs("$('.load_more > a').click()")
        assert(splash:wait(8))
        splash:runjs("$('.load_more > a').click()")
        assert(splash:wait(8))

        return {
            n0 = splash:evaljs("$('.load_more > a:visible').length"),
            n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
            m = splash:evaljs("$('*[data-pid]').length"),
            html = splash:html(),
        }
    end
    """

    # Lua script for project pages: while an "older comments" control is
    # visible, click it whenever the comment count changed since the last
    # poll, then return the rendered HTML.
    PROJECT_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))

        -- Declared explicitly so the first comparison below is well defined.
        local n_comments = -1

        while splash:evaljs("$('.older_comments:visible').length") > 0 do
            print(splash:evaljs("$('.older_comments:visible').length"))
            if (splash:evaljs("$('li.comments').length") ~= n_comments) then
                splash:runjs("$('.older_comments').click()")
            end
            n_comments = splash:evaljs("$('li.comments').length")
            assert(splash:wait(0.5))
        end
        return {
            html = splash:html(),
        }
    end
    """

    def _site_path(self, href):
        """Return *href* with the Kickstarter origin stripped, if present.

        Hrefs that do not start with ``BASE_URL + '/'`` are returned as-is.
        """
        if href.startswith(self.BASE_URL + '/'):
            return href[len(self.BASE_URL):]
        return href

    def start_requests(self):
        """Yield a Splash-rendered request for each start URL."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def parse_explore(self, response):
        """Yield a scripted Splash request for a ``/discover/`` link."""
        for href in response.xpath('//a/@href').extract():
            path = self._site_path(href)
            if not path.startswith("/discover/"):
                continue
            yield SplashRequest(
                self.BASE_URL + path,
                self.parse_discover,
                # lua_source is only honoured by the scripting endpoint.
                endpoint='execute',
                args={'lua_source': self.DISCOVER_LUA},
            )
            # NOTE(review): the original has a bare `return` directly after
            # this request; it is kept here as a "first discover link only"
            # limiter, which looks like a debugging aid -- confirm the
            # intended placement before widening the crawl.
            return

    def parse_discover(self, response):
        """Print the script's counters and optionally follow project links.

        The Lua result (minus the bulky ``html`` entry) is printed between
        two separator lines. Unless ``follow_projects`` is true nothing is
        yielded; otherwise one scripted request per distinct ``/projects/``
        link is yielded, followed by a summary print.
        """
        print('*' * 60)
        print({k: v for k, v in response.data.items() if k != 'html'})
        print('*' * 60)

        if not self.follow_projects:
            return

        urls = set()
        for href in response.xpath('//a/@href').extract():
            path = self._site_path(href)
            if not path.startswith("/projects/"):
                continue
            url = self.BASE_URL + path
            if url in urls:
                # Already requested from this page.
                continue
            urls.add(url)
            yield SplashRequest(
                url,
                self.parse_project,
                # Without the scripting endpoint the Lua source below would
                # be ignored by Splash's default render endpoint.
                endpoint='execute',
                args={'lua_source': self.PROJECT_LUA},
            )

        print('*'*20, response.url, len(urls), urls)

    def parse_project(self, response):
        """Print the URL of a rendered project page."""
        print(response.url)
@ -0,0 +1,15 @@
@@ -0,0 +1,15 @@
|
||||
# Scrapy settings that wire the scrapy-splash components into the project.

# Base URL of the Splash HTTP service that requests are sent to.
SPLASH_URL = "http://localhost:8050/"

# Splash-aware replacements for the duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"

# Spider middleware and its order value.
SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Downloader middlewares and their order values.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}
||||
|
Loading…
Reference in new issue