Skip to content

Commit 8024f03

Browse files
committed
feat(website): blog post on handlers
1 parent ac284ca commit 8024f03

File tree

12 files changed

+299
-38
lines changed

12 files changed

+299
-38
lines changed

.github/workflows/lint-and-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ jobs:
116116
SA_REPLAY_SKIP_BINARY_DOWNLOAD: 1
117117

118118
- name: Run tests
119-
run: yarn jest --testTimeout=60000 --runInBand
119+
run: yarn jest --testTimeout=60000 --runInBand --detectOpenHandles
120120
working-directory: ./build
121121
env:
122122
SA_SHOW_REPLAY: false

core/lib/CoreServerConnection.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ export default class CoreServerConnection extends TypedEventEmitter<{
1919
message: ICoreResponsePayload | ICoreEventPayload;
2020
}> {
2121
public isClosing = false;
22-
public isPersistent = false;
22+
public isPersistent = true;
2323
public autoShutdownMillis = 500;
2424

2525
private autoShutdownTimer: NodeJS.Timer;
@@ -78,7 +78,7 @@ export default class CoreServerConnection extends TypedEventEmitter<{
7878
public async connect(
7979
options: ICoreConfigureOptions & { isPersistent?: boolean } = {},
8080
): Promise<{ maxConcurrency: number; browserEmulatorIds: string[] }> {
81-
this.isPersistent = options.isPersistent ?? false;
81+
this.isPersistent = options.isPersistent ?? true;
8282
this.isClosing = false;
8383
await Core.start(options, false);
8484
return {

core/test/basic.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,11 @@ describe('basic Core tests', () => {
3131
await Core.shutdown();
3232
});
3333

34-
it('shuts down if connect not called manually and Core.start not called', async () => {
34+
it('shuts down if connect set to be not persistent and Core.start not called', async () => {
3535
shutdownSpy.mockClear();
3636

3737
const connection = Core.addConnection();
38+
await connection.connect({ isPersistent: false });
3839
Helpers.onClose(() => connection.disconnect());
3940
const connectionCloseSpy = jest.spyOn(connection, 'disconnect');
4041
connection.autoShutdownMillis = 0;

core/test/remote.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,11 @@ describe('basic remote connection tests', () => {
6666
expect(sessionId).toBeTruthy();
6767

6868
const { url } = httpServer;
69-
await agent.goto(url);
69+
await customAgent.goto(url);
7070

71-
const html = await agent.document.documentElement.outerHTML;
71+
const html = await customAgent.document.documentElement.outerHTML;
7272
expect(html).toBe('<html><head></head><body>Hello world</body></html>');
7373

74-
await agent.close();
74+
await customAgent.close();
7575
});
7676
});

core/test/user-profile.test.ts

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ let connection: CoreServerConnection;
1313
beforeAll(async () => {
1414
connection = Core.addConnection();
1515
await connection.connect();
16+
Helpers.onClose(() => connection.disconnect(), true);
1617
koaServer = await Helpers.runKoaServer();
1718
});
1819
afterAll(Helpers.afterAll);
@@ -174,7 +175,7 @@ describe('UserProfile cookie tests', () => {
174175
describe('UserProfile Dom storage tests', () => {
175176
it('should be able to save and restore local/session storage', async () => {
176177
const meta = await connection.createSession();
177-
const core = Session.getTab(meta);
178+
const tab = Session.getTab(meta);
178179

179180
koaServer.get('/local', ctx => {
180181
ctx.body = `<body>
@@ -211,8 +212,8 @@ document.querySelector('#session').innerHTML = [session1,session2,session3].join
211212
</body>`;
212213
});
213214

214-
await core.goto(`${koaServer.baseUrl}/local`);
215-
await core.waitForLoad('AllContentLoaded');
215+
await tab.goto(`${koaServer.baseUrl}/local`);
216+
await tab.waitForLoad('AllContentLoaded');
216217

217218
const profile = await connection.exportUserProfile(meta);
218219
expect(profile.cookies).toHaveLength(0);
@@ -222,26 +223,26 @@ document.querySelector('#session').innerHTML = [session1,session2,session3].join
222223
const meta2 = await connection.createSession({
223224
userProfile: profile,
224225
});
225-
const core2 = Session.getTab(meta2);
226+
const tab2 = Session.getTab(meta2);
226227

227-
await core2.goto(`${koaServer.baseUrl}/localrestore`);
228-
await core2.waitForLoad('AllContentLoaded');
228+
await tab2.goto(`${koaServer.baseUrl}/localrestore`);
229+
await tab2.waitForLoad('AllContentLoaded');
229230

230-
const localContent = await core2.execJsPath([
231+
const localContent = await tab2.execJsPath([
231232
'document',
232233
['querySelector', '#local'],
233234
'textContent',
234235
]);
235236
expect(localContent.value).toBe('value1,,value3');
236-
const sessionContent = await core2.execJsPath([
237+
const sessionContent = await tab2.execJsPath([
237238
'document',
238239
['querySelector', '#session'],
239240
'textContent',
240241
]);
241242
expect(sessionContent.value).toBe('value1,value2,');
242243

243-
await core.close();
244-
await core2.close();
244+
await tab.close();
245+
await tab2.close();
245246
});
246247

247248
it('should not make requests to end sites during profile "install"', async () => {
@@ -279,7 +280,7 @@ document.querySelector('#session').innerHTML = [session1,session2,session3].join
279280
cookies: [],
280281
},
281282
});
282-
const core = Session.getTab(meta);
283+
const tab = Session.getTab(meta);
283284

284285
koaServer.get('/local-change-pre', ctx => {
285286
ctx.body = `<body>
@@ -300,28 +301,28 @@ document.querySelector('#local').innerHTML = localStorage.getItem('test');
300301
</body>`;
301302
});
302303

303-
await core.goto(`${koaServer.baseUrl}/local-change-pre`);
304-
await core.waitForLoad('AllContentLoaded');
304+
await tab.goto(`${koaServer.baseUrl}/local-change-pre`);
305+
await tab.waitForLoad('AllContentLoaded');
305306

306307
const profile = await connection.exportUserProfile(meta);
307308
expect(profile.storage[koaServer.baseUrl]?.localStorage).toHaveLength(1);
308309
expect(profile.storage[koaServer.baseUrl]?.localStorage[0][1]).toBe('changed');
309310

310-
await core.interact([
311+
await tab.interact([
311312
{
312313
command: InteractionCommand.click,
313314
mousePosition: ['window', 'document', ['querySelector', 'a']],
314315
},
315316
]);
316317

317-
const localContent = await core.execJsPath([
318+
const localContent = await tab.execJsPath([
318319
'document',
319320
['querySelector', '#local'],
320321
'textContent',
321322
]);
322323
expect(localContent.value).toBe('changed');
323324

324-
await core.close();
325+
await tab.close();
325326
});
326327

327328
it('should store cross domain domStorage items', async () => {

examples/ulixee.org.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ import { Handler, Agent } from '@secret-agent/full-client';
1616

1717
handler.dispatchAgent(async agent => {
1818
await agent.goto('https://ulixee.org');
19-
const links = await agent.document.querySelectorAll('a.DatasetSummary');
20-
for (const link of links) {
19+
const datasetLinks = await agent.document.querySelectorAll('a.DatasetSummary');
20+
for (const link of datasetLinks) {
2121
const name = await link.querySelector('.title').textContent;
2222
const href = await link.getAttribute('href');
2323
const dataset = { name, href };

full-client/test/emulate.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { Helpers } from '@secret-agent/testing';
22
import { GlobalPool } from '@secret-agent/core';
33
import { ITestKoaServer } from '@secret-agent/testing/helpers';
4-
import Viewport from '@secret-agent/emulate-browsers-base/lib/Viewport';
4+
import Viewports from '@secret-agent/core/lib/Viewports';
55
import { Handler } from '../index';
66

77
let koaServer: ITestKoaServer;
@@ -117,7 +117,7 @@ describe('setScreensize', () => {
117117
frameBorderHeight: 0,
118118
};
119119
const viewport = Viewports.getDefault(windowFraming, windowFraming);
120-
const agent = await new SecretAgent({
120+
const agent = await handler.createAgent({
121121
viewport,
122122
});
123123
Helpers.needsClosing.push(agent);
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
---
2+
title: 'Scaling SecretAgent Scrapes with Handlers'
3+
path: /handling-scale
4+
date: 2020-12-29
5+
summary: "We needed a simpler approach to scaling out to multiple machines running SecretAgent and 1000s of waiting actions. So we added a new concept called Handlers."
6+
---
7+
8+
When you start using SecretAgent, you often copy and paste the default examples. As we started to use SecretAgent on larger extraction efforts, it became obvious that we didn't have a clear story for how you go from that starting example to running 2, or even 1,000, scrapes.
9+
10+
As you start to think about structuring a bigger effort, a bunch of questions come up:
11+
12+
- Do you create a new SecretAgent instance every time? Or do you simply add tabs?
13+
- How expensive is it to create many instances?
14+
- How should I make sure not to overload the host machine with the number of scrapes running at the same time?
15+
- How do I add new machines when I max out the current one?
16+
17+
As we explored simplifying this story, we wanted to make the progression from first examples through to full scrapes a smooth process. Something like this:
18+
19+
#### Step 1: Try Out an Example
20+
21+
Trying out examples should require as little setup as possible, so we added a new `default export` that's a ready-to-go client for SecretAgent.
22+
23+
```js
24+
import agent from 'secret-agent';
25+
26+
(async () => {
27+
// no initialization required!
28+
await agent.goto('https://ulixee.org');
29+
const datasetLinks = await agent.document.querySelectorAll('a.DatasetSummary');
30+
for (const link of datasetLinks) {
31+
const name = await link.querySelector('.title').textContent;
32+
const href = await link.getAttribute('href');
33+
const dataset = { name, href };
34+
console.log('Ulixee Dataset', dataset);
35+
}
36+
37+
await agent.close();
38+
})();
39+
```
40+
41+
#### Step 2: Run Multiple Scrapes
42+
43+
Agent instances are lightweight, but what do you do when you need to queue up thousands of them to run? Until now, you've been on your own: using libraries like `p-queue`, keeping track of promises, or simply waiting and looping.
44+
45+
We introduced a new idea into SecretAgent called a [`Handler`](/docs/basic-interfaces/handler) to help run multiple scrapes in one session. Handlers manage the concurrency of multiple scrapes to ensure your machine doesn't get overloaded and hang. We designed it so your code should require almost no changes to transition to many scrapes.
46+
47+
```js
48+
import { Handler } from 'secret-agent';
49+
50+
(async () => {
51+
const handler = new Handler({ maxConcurrency: 5 });
52+
53+
handler.dispatchAgent(async agent => {
54+
// agent is automatically created for us
55+
await agent.goto('https://ulixee.org');
56+
const datasetLinks = await agent.document.querySelectorAll('a.DatasetSummary');
57+
for (const link of datasetLinks) {
58+
const name = await link.querySelector('.title').textContent;
59+
const href = await link.getAttribute('href');
60+
const dataset = { name, href };
61+
62+
// add a name to each agent so we can find each scrape on Replay
63+
const agentOptions = { name };
64+
handler.dispatchAgent(getDatasetCost, dataset, agentOptions);
65+
}
66+
});
67+
68+
// only 5 agents will be active at a given time until all are done
69+
await handler.waitForAllDispatches();
70+
await handler.close();
71+
})();
72+
73+
// my data gets passed in once an agent is available
74+
async function getDatasetCost(agent, dataset) {
75+
let { name, href } = dataset;
76+
if (!href.startsWith('http')) href = `https://ulixee.org${href}`;
77+
console.log(href);
78+
await agent.goto(href);
79+
await agent.waitForAllContentLoaded();
80+
const cost = await agent.document.querySelector('.cost .large-text').textContent;
81+
console.log('Cost of %s is %s', dataset.name, cost);
82+
}
83+
```
84+
85+
#### Step 3: Add Scraping Machines
86+
87+
You might find that you need to increase the speed of your scrapes. So the next transition you'll likely want to make is to add remote machines. Handlers are built to round-robin between multiple [`CoreConnections`](/docs/advanced/core-connection).
88+
89+
```js
90+
import { Handler } from 'secret-agent';
91+
92+
(async () => {
93+
const handler = new Handler(
94+
{
95+
maxConcurrency: 5,
96+
host: '192.168.1.1:2300', // fictional remote secret-agent #1
97+
},
98+
{
99+
maxConcurrency: 5,
100+
host: '192.168.1.2:2300', // fictional remote secret-agent #2
101+
},
102+
);
103+
104+
// ... everything else is the same!
105+
106+
handler.dispatchAgent(async agent => {
107+
// agent is automatically created for us
108+
await agent.goto('https://ulixee.org');
109+
...
110+
```
111+
112+
113+
#### Default Exports
114+
115+
To get to this setup, you'll notice some changes in the default exports when you install SecretAgent 1.3.0-alpha.1. The default export of the `secret-agent` and `@secret-agent/client` packages is now a pre-initialized instance of the `Agent` class (`SecretAgent` was renamed to `Agent`).
116+
117+
[`Handler`](/docs/basic-interfaces/handler) and [`Agent`](/docs/basic-interfaces/agent) are available as named exports from both the `secret-agent` and `@secret-agent/client` packages if you'd like to continue to use those. To customize a "Remote" `SecretAgent` for an [`Agent`](/docs/basic-interfaces/agent), you can create a new instance with a [`coreConnection`](/docs/basic-interfaces/agent#constructor) parameter, or use the [`.configure()`](/docs/basic-interfaces/agent#configure) function.
118+
119+
120+
#### That's it!
121+
122+
That's our change. We hope it leads to a very simple model for understanding how to scale up your SecretAgent instances. Feedback is welcome as always on any of our channels (listed in header)!

website/gridsome.config.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,23 @@ module.exports = {
6060
},
6161
},
6262
},
63+
{
64+
use: '@gridsome/vue-remark',
65+
options: {
66+
baseDir: './blog',
67+
pathPrefix: '/blog',
68+
typeName: 'Post',
69+
template: './src/templates/BlogPost.vue',
70+
plugins: ['@gridsome/remark-prismjs'],
71+
remark: {
72+
autolinkHeadings: {
73+
content: {
74+
type: 'text',
75+
value: '#',
76+
},
77+
},
78+
},
79+
},
80+
},
6381
],
6482
};

website/src/layouts/partials/Nav.vue

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,27 @@
33
<g-link to="/docs/">
44
<span class="main-nav__label">Docs</span>
55
</g-link>
6+
<g-link to="/blog/">
7+
<span class="main-nav__label">Blog</span>
8+
</g-link>
69
<g-link to="/why/">
710
<span class="main-nav__label">Why?</span>
811
</g-link>
912
</nav>
1013
</template>
1114

12-
1315
<style lang="scss">
16+
.main-nav {
17+
flex-wrap: nowrap;
18+
}
19+
@media screen and (max-width: 850px) {
1420
.main-nav {
15-
flex-wrap: nowrap;
16-
}
17-
@media screen and (max-width: 850px) {
18-
.main-nav {
19-
order: 10;
20-
min-width: 100%;
21-
a {
22-
padding-top: 5px;
23-
padding-bottom: 10px;
24-
}
21+
order: 10;
22+
min-width: 100%;
23+
a {
24+
padding-top: 5px;
25+
padding-bottom: 10px;
2526
}
2627
}
28+
}
2729
</style>

0 commit comments

Comments
 (0)