已更新
:
可以在Github上找到基于WebBrowser的控制台Web
scraper 的最新版本。
已更新
:添加
WebBrowser对象池以进行多个并行下载。
您是否有示例说明如何在控制台应用程序中执行此操作?另外,我也不认为webBrowser可以是一个类变量,因为我要为每个变量并行运行整个过程,从而迭代数千个URL
以下是一个基于 WebBrowser 的、或多或少通用的 Web scraper 实现,
可作为控制台应用程序使用。这是我以前与
WebBrowser 相关的工作的合并,包括问题中引用的代码:
捕获不透明的网页图像
加载具有动态AJAX内容的页面
创建STA消息循环线程以用于
WebBrowser
依次加载一组URL
使用 WebBrowser 打印一组URL
网页UI自动化
几点:
可重用的
MessageLoopApartment
类用于使用其自己的消息泵来启动和运行WinForms STA线程。可以从 控制台应用程序 使用它,如下所示。此类公开了TPL任务计划程序(FromCurrentSynchronizationContext
)和一组Task.Factory.StartNew
包装程序以使用此任务计划程序。这是在单独的STA线程上
async/await
运行WebBrowser
导航任务的绝佳工具。这样,WebBrowser
就可以在该线程上创建,导航和销毁对象。虽然,MessageLoopApartment
不是WebBrowser
专门捆绑的。使用浏览器功能控件启用HTML5渲染很重要,因为
WebBrowser
默认情况下,对象在IE7仿真模式下运行。这就是SetFeatureBrowserEmulation
下面的内容。不一定总是可以确定网页何时以100%的概率完成渲染。一些页面非常复杂,并使用连续的AJAX更新。但是,通过
DocumentCompleted
首先处理事件,然后轮询页面的当前HTML快照以查找更改并检查WebBrowser.IsBusy
属性,我们可以非常接近。这就是NavigateAsync
下面的内容。如果页面渲染永无休止(请注意
CancellationTokenSource
和CreateLinkedTokenSource
),则上述内容中会出现超时逻辑。
using Microsoft.Win32;
using System;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;namespace Console_22239357
{
/// <summary>
/// Console entry point: scrapes a fixed list of URLs with a WebBrowser control
/// hosted on a dedicated STA message-loop thread.
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
class Program
{
    /// <summary>
    /// Main logic: sequentially navigates to each URL and prints the rendered HTML.
    /// </summary>
    /// <param name="urls">The URLs to download, processed one after another.</param>
    /// <param name="token">Cancels the whole scraping session.</param>
    /// <returns>A task that completes when all URLs have been processed.</returns>
    static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
    {
        using (var apartment = new MessageLoopApartment())
        {
            // create WebBrowser inside MessageLoopApartment
            var webBrowser = apartment.Invoke(() => new WebBrowser());
            try
            {
                foreach (var url in urls)
                {
                    Console.WriteLine("URL:\n" + url);

                    // cancel in 30s or when the main token is signalled;
                    // the linked source is disposed per URL so that its registration
                    // on the outer token and its timer do not accumulate
                    using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token))
                    {
                        navigationCts.CancelAfter((int)TimeSpan.FromSeconds(30).TotalMilliseconds);
                        var navigationToken = navigationCts.Token;

                        // run the navigation task inside MessageLoopApartment
                        string html = await apartment.Run(
                            () => webBrowser.NavigateAsync(url, navigationToken),
                            navigationToken);

                        Console.WriteLine("HTML:\n" + html);
                    }
                }
            }
            finally
            {
                // dispose of WebBrowser inside MessageLoopApartment
                apartment.Invoke(() => webBrowser.Dispose());
            }
        }
    }

    /// <summary>
    /// Entry point: enables the browser emulation feature, scrapes the sample URLs
    /// with a 3 minute overall timeout and waits for Enter before exiting.
    /// Any failure is unwrapped from AggregateException, printed, and the process
    /// exits with code -1.
    /// </summary>
    static void Main(string[] args)
    {
        try
        {
            WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

            using (var cts = new CancellationTokenSource((int)TimeSpan.FromMinutes(3).TotalMilliseconds))
            {
                var task = ScrapSitesAsync(
                    new[] { "http://example.com", "http://example.org", "http://example.net" },
                    cts.Token);

                // blocking is acceptable here: this is the console main thread,
                // the async work runs on the apartment's own STA thread
                task.Wait();
            }

            Console.WriteLine("Press Enter to exit...");
            Console.ReadLine();
        }
        catch (Exception ex)
        {
            // task.Wait() wraps failures in AggregateException; report the innermost cause
            while (ex is AggregateException && ex.InnerException != null)
            {
                ex = ex.InnerException;
            }

            Console.WriteLine(ex.Message);
            Environment.Exit(-1);
        }
    }
}

/// <summary>
/// WebBrowserExt - WebBrowser extensions
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public static class WebBrowserExt
{
    // interval, in milliseconds, between two HTML snapshots while polling for changes
    const int POLL_DELAY = 500;

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits for DocumentCompleted, then polls the
    /// page's HTML until it stops changing and the browser is no longer busy.
    /// Must be called on the thread that owns <paramref name="webBrowser"/>.
    /// </summary>
    /// <param name="webBrowser">The browser control to navigate.</param>
    /// <param name="url">The address to load.</param>
    /// <param name="token">Cancels the navigation and the polling loop.</param>
    /// <returns>The outer HTML of the root html element once it is considered stable.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="token"/> was signalled before the page settled.
    /// </exception>
    public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
    {
        // navigate and await DocumentCompleted
        var tcs = new TaskCompletionSource<bool>();
        WebBrowserDocumentCompletedEventHandler handler = (s, arg) =>
            tcs.TrySetResult(true);

        using (token.Register(() => tcs.TrySetCanceled(), useSynchronizationContext: true))
        {
            webBrowser.DocumentCompleted += handler;
            try
            {
                webBrowser.Navigate(url);
                await tcs.Task; // wait for DocumentCompleted
            }
            finally
            {
                // always unsubscribe so the handler does not outlive this navigation
                webBrowser.DocumentCompleted -= handler;
            }
        }

        // get the root element
        var documentElement = webBrowser.Document.GetElementsByTagName("html")[0];

        // poll the current HTML for changes asynchronously
        var html = documentElement.OuterHtml;
        while (true)
        {
            // wait asynchronously, this will throw if cancellation requested
            await Task.Delay(POLL_DELAY, token);

            // continue polling if the WebBrowser is still busy
            if (webBrowser.IsBusy)
            {
                continue;
            }

            var htmlNow = documentElement.OuterHtml;
            if (html == htmlNow)
            {
                break; // no changes detected, end the poll loop
            }

            html = htmlNow;
        }

        // consider the page fully rendered
        token.ThrowIfCancellationRequested();
        return html;
    }

    /// <summary>
    /// Enables HTML5 rendering for the current executable (assuming IE10+ is installed)
    /// by writing its file name under the per-user FEATURE_BROWSER_EMULATION key.
    /// Does nothing outside of runtime usage mode (e.g. in a designer).
    /// More info: https://stackoverflow.com/a/18333982/1768303
    /// </summary>
    public static void SetFeatureBrowserEmulation()
    {
        if (System.ComponentModel.LicenseManager.UsageMode != System.ComponentModel.LicenseUsageMode.Runtime)
        {
            return;
        }

        var appName = System.IO.Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

        // the value is keyed by the executable's file name; 10000 is the value the
        // original answer uses for IE10+ rendering
        Registry.SetValue(
            @"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION",
            appName, 10000, RegistryValueKind.DWord);
    }
}

/// <summary>
/// MessageLoopApartment
/// STA thread with message pump for serial execution of tasks
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public class MessageLoopApartment : IDisposable
{
    Thread _thread; // the STA thread

    TaskScheduler _taskScheduler; // the STA thread's task scheduler

    /// <summary>The scheduler that queues work to the STA thread's message loop.</summary>
    public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

    /// <summary>
    /// MessageLoopApartment constructor: starts a background STA thread running
    /// Application.Run() and blocks until its task scheduler is available.
    /// </summary>
    public MessageLoopApartment()
    {
        var tcs = new TaskCompletionSource<TaskScheduler>();

        // start an STA thread and get a task scheduler
        _thread = new Thread(startArg =>
        {
            EventHandler idleHandler = null;

            idleHandler = (s, e) =>
            {
                // handle Application.Idle just once
                Application.Idle -= idleHandler;
                // return the task scheduler
                tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
            };

            // handle Application.Idle just once
            // to make sure we're inside the message loop
            // and SynchronizationContext has been correctly installed
            Application.Idle += idleHandler;
            Application.Run();
        });

        _thread.SetApartmentState(ApartmentState.STA);
        _thread.IsBackground = true;
        _thread.Start();
        _taskScheduler = tcs.Task.Result;
    }

    /// <summary>
    /// Shuts down the STA thread: posts Application.ExitThread() to it and joins it.
    /// Subsequent calls are no-ops.
    /// </summary>
    public void Dispose()
    {
        if (_taskScheduler != null)
        {
            var taskScheduler = _taskScheduler;
            _taskScheduler = null;

            // execute Application.ExitThread() on the STA thread
            Task.Factory.StartNew(
                () => Application.ExitThread(),
                CancellationToken.None,
                TaskCreationOptions.None,
                taskScheduler).Wait();

            _thread.Join();
            _thread = null;
        }
    }

    // Task.Factory.StartNew wrappers

    /// <summary>Runs <paramref name="action"/> on the STA thread and blocks until it completes.</summary>
    public void Invoke(Action action)
    {
        Task.Factory.StartNew(action,
            CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Wait();
    }

    /// <summary>Runs <paramref name="action"/> on the STA thread and blocks for its result.</summary>
    public TResult Invoke<TResult>(Func<TResult> action)
    {
        return Task.Factory.StartNew(action,
            CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Result;
    }

    /// <summary>Queues <paramref name="action"/> to the STA thread without blocking.</summary>
    public Task Run(Action action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
    }

    /// <summary>Queues <paramref name="action"/> to the STA thread and returns a task for its result.</summary>
    public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
    }

    /// <summary>
    /// Starts the asynchronous <paramref name="action"/> on the STA thread; the returned
    /// task is unwrapped, so it completes when the inner task does.
    /// </summary>
    public Task Run(Func<Task> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
    }

    /// <summary>
    /// Starts the asynchronous <paramref name="action"/> on the STA thread; the returned
    /// task is unwrapped, so it yields the inner task's result.
    /// </summary>
    public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
    }
}
}
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)