C#爬虫-Selenium ChromeDriver 设置代理

背景

开发爬虫程序,如果不做代理设置,本机的外网IP很容易被网站封掉,导致不能持续进行数据抓取。而Selenium作为动态网页抓取的利器,我们有必要了解一下,如何对它进行代理设置,并正常访问网页。

解决办法

1、首先申请代理IP,付费的代理通常比较靠谱。申请后会得到代理的域名地址、端口、账号和密码。


 // Proxy connection settings. The quoted values and the port below are placeholders
 // that must be replaced with the details issued by the proxy provider.

 // Host name of the proxy server.
 private string proxy_Host = "域名地址";
 // Port of the proxy server; joined to proxy_Host as "host:port" for --proxy-server.
 private int proxy_Post = 端口;
 // Account name passed to Rebuild_Extension_Proxy for proxy authentication.
 private string proxy_UserName = "账号";
 // Password passed to Rebuild_Extension_Proxy for proxy authentication.
 private string proxy_PassWord = "密码";
 // URL used to check whether the proxy works (not referenced by the snippets shown here).
 private string proxy_CheckURL = "检查是否正常的地址";
 // File name of the generated extension archive that is handed to ChromeOptions.AddExtension.
 private string Ex_Proxy_Name = "proxy.zip";

2、生成Chrome扩展所需的 background.js 和 manifest.json,并打包成 zip 文件

 /// <summary>
 /// Rebuilds the Chrome extension archive (file name taken from <c>Ex_Proxy_Name</c>) in the
 /// current directory. The archive contains a background.js that answers proxy
 /// authentication challenges with the supplied credentials, and the manifest.json that
 /// declares the extension.
 /// </summary>
 /// <param name="proxy_UserName">Proxy account name embedded into background.js.</param>
 /// <param name="proxy_PassWord">Proxy password embedded into background.js.</param>
 /// <returns><c>true</c> when the archive was written; <c>false</c> when any step threw.</returns>
 private bool Rebuild_Extension_Proxy(string proxy_UserName, string proxy_PassWord)
 {
     // The credentials are spliced into single-quoted JavaScript literals, so they are
     // escaped first; otherwise a quote or backslash in the password corrupts the script.
     // NOTE(review): the onMessage listener references requestHandler / POPUP_PARAMS, which
     // this script never defines — kept as in the original; confirm whether it is needed.
     string background = @"
var Global = {
    currentProxyAouth: {
        username: '" + Escape_Js_String(proxy_UserName) + @"',
        password: '" + Escape_Js_String(proxy_PassWord) + @"'
    }
};

chrome.webRequest.onAuthRequired.addListener(
    function(details, callbackFn) {
        console.log('onAuthRequired >>>: ', details, callbackFn);
        callbackFn({ authCredentials: Global.currentProxyAouth });
    },
    { urls: [""<all_urls>""] },
    [""asyncBlocking""]);

chrome.runtime.onMessage.addListener(
    function(request, sender, sendResponse) {
        console.log('Background recieved a message: ', request);

        POPUP_PARAMS = {};
        if (request.command && requestHandler[request.command])
            requestHandler[request.command] (request);
    }
);";

     // NOTE(review): this is a manifest_version 2 extension — confirm the targeted Chrome
     // build still loads MV2 extensions.
     string manifest = @"
{
    ""version"": ""1.0.0"",
    ""manifest_version"": 2,
    ""name"": ""Chrome Proxy"",
    ""permissions"": [
        ""proxy"",
        ""tabs"",
        ""unlimitedStorage"",
        ""storage"",
        ""<all_urls>"",
        ""webRequest"",
        ""webRequestBlocking""
    ],
    ""background"": {
        ""scripts"": [""background.js""]
    },
    ""minimum_chrome_version"":""22.0.0""
}";

     try
     {
         string zipPath = Path.Combine(System.Environment.CurrentDirectory, Ex_Proxy_Name);

         // FileMode.Create truncates any previous archive, so the zip is always rebuilt
         // from scratch; the using blocks release the file even when writing fails.
         using (FileStream zipToOpen = new FileStream(zipPath, FileMode.Create))
         using (ZipArchive archive = new ZipArchive(zipToOpen, ZipArchiveMode.Create))
         {
             Write_Zip_Entry(archive, "background.js", background);
             Write_Zip_Entry(archive, "manifest.json", manifest);
         }

         return true;
     }
     catch (Exception ex)
     {
         // Callers only look at the boolean, so failure is reported through the return
         // value; the exception is traced instead of being dropped silently.
         System.Diagnostics.Trace.TraceError("Rebuild_Extension_Proxy failed: " + ex);
         return false;
     }
 }

 /// <summary>Adds one text entry to the archive and writes <paramref name="content"/> into it.</summary>
 private static void Write_Zip_Entry(ZipArchive archive, string entryName, string content)
 {
     ZipArchiveEntry entry = archive.CreateEntry(entryName);
     using (StreamWriter writer = new StreamWriter(entry.Open()))
     {
         writer.WriteLine(content);
     }
 }

 /// <summary>Escapes a value for use inside a single-quoted JavaScript string literal.</summary>
 private static string Escape_Js_String(string value)
 {
     if (string.IsNullOrEmpty(value))
     {
         return string.Empty;
     }

     // Backslashes must be doubled first so the escapes added afterwards stay intact.
     return value
         .Replace("\\", "\\\\")
         .Replace("'", "\\'")
         .Replace("\r", "\\r")
         .Replace("\n", "\\n");
 }

3、Chrome Driver使用代理Proxy

 // Build the Chrome proxy-authentication extension, but only when a proxy is in use.
 bool isproxysetting = true;
 if (_isuseproxy)
 {
     isproxysetting = Rebuild_Extension_Proxy(proxy_UserName, proxy_PassWord);
 }

 if (isproxysetting)
 {
     // Driver options
     options = new ChromeOptions();
     if (_isuseproxy)
     {
         // No Proxy object is set; the proxy endpoint is passed through the
         // --proxy-server switch and the credentials come from the packed extension.
         options.Proxy = null;
         options.AddArguments("--proxy-server=" + proxy_Host + ":" + proxy_Post.ToString());
         options.AddExtension(Ex_Proxy_Name);
     }

     // The original snippet stops here; creating the driver from `options` is not shown.
 }

4、测试一下我们的设置

 /// <summary>
 /// Extracts the proxy IP information from the page source of the proxy check page.
 /// </summary>
 /// <param name="Html_Content">Page source: a JSON document wrapped in an HTML &lt;pre&gt; shell.</param>
 /// <returns>
 /// The deserialized <c>ProxyIPInfo</c>, or <c>null</c> when the content is empty, contains
 /// "proxy error", or cannot be deserialized.
 /// </returns>
 private Proxy_Unit.ProxyIPInfo Get_ProxyIPInfo(string Html_Content)
 {
     // Nothing to parse; previously a null input only reached this result through a
     // caught NullReferenceException.
     if (string.IsNullOrEmpty(Html_Content))
     {
         return null;
     }

     try
     {
         // Strip the HTML shell around the payload so only the JSON text remains.
         string json = Html_Content
             .Replace("<html><head></head><body><pre style=\"word-wrap: break-word; white-space: pre-wrap;\">", "")
             .Replace("</pre></body></html>", "");

         // Content containing "proxy error" is treated as a failed check, not as JSON.
         if (json.Contains("proxy error"))
         {
             return null;
         }

         return JsonConvert.DeserializeObject<Proxy_Unit.ProxyIPInfo>(json);
     }
     catch (Exception ex)
     {
         // Keep the "null means failure" contract, but leave a trace of what went wrong.
         System.Diagnostics.Trace.TraceError("Get_ProxyIPInfo failed: " + ex);
         return null;
     }
 }

测试效果

成功,达到预期效果

{ "ip":"213.182.205.185", "country":"IS", "asn":{ "asnum":9009, "org_name":"M247 Ltd" }, "geo":{ "city":"Reykjavik", "region":"1", "region_name":"Capital Region", "postal_code":"105", "latitude":64.1369, "longitude":-21.9139, "tz":"Atlantic/Reykjavik", "lum_city":"reykjavik", "lum_region":"1" }}

总结

我们之前在为ChromeDriver设定Proxy时遇到了许多困难,最终需要通过Chrome Extension的方式设定Proxy才成功。希望以上内容能帮助您更好地理解。


举报
评论 0